]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18
19 #include "unicode/brkiter.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/numfmt.h"
22 #include "unicode/rbbi.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #include "unicode/regex.h"
25 #endif
26 #include "unicode/schriter.h"
27 #include "unicode/uchar.h"
28 #include "unicode/utf16.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uniset.h"
31 #include "unicode/uscript.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utext.h"
34
35 #include "charstr.h"
36 #include "cmemory.h"
37 #include "intltest.h"
38 #include "rbbitst.h"
39 #include "utypeinfo.h" // for 'typeid' to work
40 #include "uvector.h"
41 #include "uvectr32.h"
42
43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
44 #include "unicode/filteredbrk.h"
45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
46
// Report a test failure, tagged with the invoking file and line, when the
// condition x evaluates to false.  Must be used inside a test class that
// provides errln().
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}
49
// Report a test failure, tagged with the invoking file and line and the name
// of the error code, when errcode holds a failure value.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
52
53
54 //---------------------------------------------
55 // runIndexedTest
56 //---------------------------------------------
57
58
59 // Note: Before adding new tests to this file, check whether the desired test data can
60 // simply be added to the file testdata/rbbitest.txt. In most cases it can;
61 // that is much less work than writing a new test, diagnostic output in the event of failures
62 // is good, and the test data file is shared with ICU4J, so eventually the test
63 // will run there as well, without additional effort.
64
65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
66 {
67 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
68
69 switch (index) {
70 #if !UCONFIG_NO_FILE_IO
71 case 0: name = "TestBug4153072";
72 if(exec) TestBug4153072(); break;
73 #else
74 case 0: name = "skip";
75 break;
76 #endif
77
78 case 1: name = "skip";
79 break;
80 case 2: name = "TestStatusReturn";
81 if(exec) TestStatusReturn(); break;
82
83 #if !UCONFIG_NO_FILE_IO
84 case 3: name = "TestUnicodeFiles";
85 if(exec) TestUnicodeFiles(); break;
86 case 4: name = "TestEmptyString";
87 if(exec) TestEmptyString(); break;
88 #else
89 case 3: case 4: name = "skip";
90 break;
91 #endif
92
93 case 5: name = "TestGetAvailableLocales";
94 if(exec) TestGetAvailableLocales(); break;
95
96 case 6: name = "TestGetDisplayName";
97 if(exec) TestGetDisplayName(); break;
98
99 #if !UCONFIG_NO_FILE_IO
100 case 7: name = "TestEndBehaviour";
101 if(exec) TestEndBehaviour(); break;
102 case 8: case 9: case 10: name = "skip";
103 break;
104 case 11: name = "TestWordBreaks";
105 if(exec) TestWordBreaks(); break;
106 case 12: name = "TestWordBoundary";
107 if(exec) TestWordBoundary(); break;
108 case 13: name = "TestLineBreaks";
109 if(exec) TestLineBreaks(); break;
110 case 14: name = "TestSentBreaks";
111 if(exec) TestSentBreaks(); break;
112 case 15: name = "TestExtended";
113 if(exec) TestExtended(); break;
114 #else
115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
116 break;
117 #endif
118
119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
120 case 16:
121 name = "TestMonkey"; if(exec) TestMonkey(params); break;
122 #else
123 case 16:
124 name = "skip"; break;
125 #endif
126
127 #if !UCONFIG_NO_FILE_IO
128 case 17: name = "TestBug3818";
129 if(exec) TestBug3818(); break;
130 #else
131 case 17: name = "skip";
132 break;
133 #endif
134
135 case 18: name = "skip";
136 break;
137 case 19: name = "TestDebug";
138 if(exec) TestDebug(); break;
139 case 20: name = "skip";
140 break;
141
142 #if !UCONFIG_NO_FILE_IO
143 case 21: name = "TestBug5775";
144 if (exec) TestBug5775(); break;
145 #else
146 case 21: name = "skip";
147 break;
148 #endif
149
150 case 22: name = "TestBug9983";
151 if (exec) TestBug9983(); break;
152 case 23: name = "TestDictRules";
153 if (exec) TestDictRules(); break;
154 case 24: name = "TestBug5532";
155 if (exec) TestBug5532(); break;
156 default: name = ""; break; //needed to end loop
157 }
158 }
159
160
161 //---------------------------------------------------------------------------
162 //
163 // class BITestData Holds a set of Break iterator test data and results
164 // Includes
165 // - the string data to be broken
166 // - a vector of the expected break positions.
167 // - a vector of source line numbers for the data,
168 // (to help see where errors occurred.)
169 // - The expected break tag values.
170 // - Vectors of actual break positions and tag values.
171 // - Functions for comparing actual with expected and
172 // reporting errors.
173 //
174 //----------------------------------------------------------------------------
175 class BITestData {
176 public:
177 UnicodeString fDataToBreak;
178 UVector fExpectedBreakPositions;
179 UVector fExpectedTags;
180 UVector fLineNum;
181 UVector fActualBreakPositions; // Test Results.
182 UVector fActualTags;
183
184 BITestData(UErrorCode &status);
185 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
186 void checkResults(const char *heading, RBBITest *test);
187 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
188 void clearResults();
189 };
190
191 //
192 // Constructor.
193 //
194 BITestData::BITestData(UErrorCode &status)
195 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
196 fActualTags(status)
197 {
198 }
199
200 //
201 // addDataChunk. Add a section (non-breaking) piece if data to the test data.
202 // The macro form collects the line number, which is helpful
203 // when tracking down failures.
204 //
205 // A null data item is inserted at the start of each test's data
206 // to put the starting zero into the data list. The position saved for
207 // each non-null item is its ending position.
208 //
209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
211 if (U_FAILURE(status)) {return;}
212 if (data != NULL) {
213 fDataToBreak.append(CharsToUnicodeString(data));
214 }
215 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
216 fExpectedTags.addElement(tag, status);
217 fLineNum.addElement(lineNum, status);
218 }
219
220
221 //
222 // checkResults. Compare the actual and expected break positions, report any differences.
223 //
224 void BITestData::checkResults(const char *heading, RBBITest *test) {
225 int32_t expectedIndex = 0;
226 int32_t actualIndex = 0;
227
228 for (;;) {
229 // If we've run through both the expected and actual results vectors, we're done.
230 // break out of the loop.
231 if (expectedIndex >= fExpectedBreakPositions.size() &&
232 actualIndex >= fActualBreakPositions.size()) {
233 break;
234 }
235
236
237 if (expectedIndex >= fExpectedBreakPositions.size()) {
238 err(heading, test, expectedIndex-1, actualIndex);
239 actualIndex++;
240 continue;
241 }
242
243 if (actualIndex >= fActualBreakPositions.size()) {
244 err(heading, test, expectedIndex, actualIndex-1);
245 expectedIndex++;
246 continue;
247 }
248
249 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
250 err(heading, test, expectedIndex, actualIndex);
251 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
252 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
253 actualIndex++;
254 } else {
255 expectedIndex++;
256 }
257 continue;
258 }
259
260 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
261 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
262 heading, fLineNum.elementAt(expectedIndex),
263 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
264 }
265
266 actualIndex++;
267 expectedIndex++;
268 }
269 }
270
271 //
272 // err - An error was found. Report it, along with information about where the
273 // incorrectly broken test data appeared in the source file.
274 //
275 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
276 {
277 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
278 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
279 int32_t o = 0;
280 int32_t line = fLineNum.elementAti(expectedIdx);
281 if (expectedIdx > 0) {
282 // The line numbers are off by one because a premature break occurs somewhere
283 // within the previous item, rather than at the start of the current (expected) item.
284 // We want to report the offset of the unexpected break from the start of
285 // this previous item.
286 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
287 }
288 if (actual < expected) {
289 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
290 } else {
291 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
292 }
293 }
294
295
296 void BITestData::clearResults() {
297 fActualBreakPositions.removeAllElements();
298 fActualTags.removeAllElements();
299 }
300
301
302 //--------------------------------------------------------------------------------------
303 //
304 // RBBITest constructor and destructor
305 //
306 //--------------------------------------------------------------------------------------
307
308 RBBITest::RBBITest() {
309 }
310
311
312 RBBITest::~RBBITest() {
313 }
314
315 //-----------------------------------------------------------------------------------
316 //
317 // Test for status {tag} return value from break rules.
318 // TODO: a more thorough test.
319 //
320 //-----------------------------------------------------------------------------------
321 void RBBITest::TestStatusReturn() {
322 UnicodeString rulesString1("$Letters = [:L:];\n"
323 "$Numbers = [:N:];\n"
324 "$Letters+{1};\n"
325 "$Numbers+{2};\n"
326 "Help\\ /me\\!{4};\n"
327 "[^$Letters $Numbers];\n"
328 "!.*;\n", -1, US_INV);
329 UnicodeString testString1 = "abc123..abc Help me Help me!";
330 // 01234567890123456789012345678
331 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
332 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
333
334 UErrorCode status=U_ZERO_ERROR;
335 UParseError parseError;
336
337 LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
338 if(U_FAILURE(status)) {
339 dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
340 return;
341 }
342 int32_t pos;
343 int32_t i = 0;
344 bi->setText(testString1);
345 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
346 if (pos != bounds1[i]) {
347 errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
348 break;
349 }
350
351 int tag = bi->getRuleStatus();
352 if (tag != brkStatus[i]) {
353 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
354 break;
355 }
356 i++;
357 }
358 }
359
360
361 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
362 UErrorCode status = U_ZERO_ERROR;
363 char name[100];
364 printf("code alpha extend alphanum type word sent line name\n");
365 int nextExpectedIndex = 0;
366 utext_setNativeIndex(tstr, 0);
367 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
368 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
369 printf("------------------------------------------------ %d\n", j);
370 ++nextExpectedIndex;
371 }
372
373 UChar32 c = utext_next32(tstr);
374 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
375 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
376 u_isUAlphabetic(c),
377 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
378 u_isalnum(c),
379 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
380 u_charType(c),
381 U_SHORT_PROPERTY_NAME),
382 u_getPropertyValueName(UCHAR_WORD_BREAK,
383 u_getIntPropertyValue(c,
384 UCHAR_WORD_BREAK),
385 U_SHORT_PROPERTY_NAME),
386 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
387 u_getIntPropertyValue(c,
388 UCHAR_SENTENCE_BREAK),
389 U_SHORT_PROPERTY_NAME),
390 u_getPropertyValueName(UCHAR_LINE_BREAK,
391 u_getIntPropertyValue(c,
392 UCHAR_LINE_BREAK),
393 U_SHORT_PROPERTY_NAME),
394 name);
395 }
396 }
397
398
399 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
400 UErrorCode status = U_ZERO_ERROR;
401 UText *tstr = NULL;
402 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
403 if (U_FAILURE(status)) {
404 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
405 return;
406 }
407 printStringBreaks(tstr, expected, expectedCount);
408 utext_close(tstr);
409 }
410
411
412 void RBBITest::TestBug3818() {
413 UErrorCode status = U_ZERO_ERROR;
414
415 // Four Thai words...
416 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
417 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
418 UnicodeString thaiStr(thaiWordData);
419
420 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
421 if (U_FAILURE(status) || bi == NULL) {
422 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
423 return;
424 }
425 bi->setText(thaiStr);
426
427 int32_t startOfSecondWord = bi->following(1);
428 if (startOfSecondWord != 4) {
429 errln("Fail at file %s, line %d expected start of word at 4, got %d",
430 __FILE__, __LINE__, startOfSecondWord);
431 }
432 startOfSecondWord = bi->following(0);
433 if (startOfSecondWord != 4) {
434 errln("Fail at file %s, line %d expected start of word at 4, got %d",
435 __FILE__, __LINE__, startOfSecondWord);
436 }
437 delete bi;
438 }
439
440 //----------------------------------------------------------------------------
441 //
442 // generalIteratorTest Given a break iterator and a set of test data,
443 // Run the tests and report the results.
444 //
445 //----------------------------------------------------------------------------
446 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
447 {
448
449 bi.setText(td.fDataToBreak);
450
451 testFirstAndNext(bi, td);
452
453 testLastAndPrevious(bi, td);
454
455 testFollowing(bi, td);
456 testPreceding(bi, td);
457 testIsBoundary(bi, td);
458 doMultipleSelectionTest(bi, td);
459 }
460
461
462 //
463 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
464 // kind of loop.
465 //
466 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
467 {
468 UErrorCode status = U_ZERO_ERROR;
469 int32_t p;
470 int32_t lastP = -1;
471 int32_t tag;
472
473 logln("Test first and next");
474 bi.setText(td.fDataToBreak);
475 td.clearResults();
476
477 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
478 td.fActualBreakPositions.addElement(p, status); // Save result.
479 tag = bi.getRuleStatus();
480 td.fActualTags.addElement(tag, status);
481 if (p <= lastP) {
482 // If the iterator is not making forward progress, stop.
483 // No need to raise an error here, it'll be detected in the normal check of results.
484 break;
485 }
486 lastP = p;
487 }
488 td.checkResults("testFirstAndNext", this);
489 }
490
491
492 //
493 // TestLastAndPrevious. Run the iterator backwards, starting with last().
494 //
495 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
496 {
497 UErrorCode status = U_ZERO_ERROR;
498 int32_t p;
499 int32_t lastP = 0x7ffffffe;
500 int32_t tag;
501
502 logln("Test last and previous");
503 bi.setText(td.fDataToBreak);
504 td.clearResults();
505
506 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
507 // Save break position. Insert it at start of vector of results, shoving
508 // already-saved results further towards the end.
509 td.fActualBreakPositions.insertElementAt(p, 0, status);
510 // bi.previous(); // TODO: Why does this fix things up????
511 // bi.next();
512 tag = bi.getRuleStatus();
513 td.fActualTags.insertElementAt(tag, 0, status);
514 if (p >= lastP) {
515 // If the iterator is not making progress, stop.
516 // No need to raise an error here, it'll be detected in the normal check of results.
517 break;
518 }
519 lastP = p;
520 }
521 td.checkResults("testLastAndPrevious", this);
522 }
523
524
525 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
526 {
527 UErrorCode status = U_ZERO_ERROR;
528 int32_t p;
529 int32_t tag;
530 int32_t lastP = -2; // A value that will never be returned as a break position.
531 // cannot be -1; that is returned for DONE.
532 int i;
533
534 logln("testFollowing():");
535 bi.setText(td.fDataToBreak);
536 td.clearResults();
537
538 // Save the starting point, since we won't get that out of following.
539 p = bi.first();
540 td.fActualBreakPositions.addElement(p, status); // Save result.
541 tag = bi.getRuleStatus();
542 td.fActualTags.addElement(tag, status);
543
544 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
545 p = bi.following(i);
546 if (p != lastP) {
547 if (p == RuleBasedBreakIterator::DONE) {
548 break;
549 }
550 // We've reached a new break position. Save it.
551 td.fActualBreakPositions.addElement(p, status); // Save result.
552 tag = bi.getRuleStatus();
553 td.fActualTags.addElement(tag, status);
554 lastP = p;
555 }
556 }
557 // The loop normally exits by means of the break in the middle.
558 // Make sure that the index was at the correct position for the break iterator to have
559 // returned DONE.
560 if (i != td.fDataToBreak.length()) {
561 errln("testFollowing(): iterator returned DONE prematurely.");
562 }
563
564 // Full check of all results.
565 td.checkResults("testFollowing", this);
566 }
567
568
569
570 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
571 UErrorCode status = U_ZERO_ERROR;
572 int32_t p;
573 int32_t tag;
574 int32_t lastP = 0x7ffffffe;
575 int i;
576
577 logln("testPreceding():");
578 bi.setText(td.fDataToBreak);
579 td.clearResults();
580
581 p = bi.last();
582 td.fActualBreakPositions.addElement(p, status);
583 tag = bi.getRuleStatus();
584 td.fActualTags.addElement(tag, status);
585
586 for (i = td.fDataToBreak.length(); i>=-1; i--) {
587 p = bi.preceding(i);
588 if (p != lastP) {
589 if (p == RuleBasedBreakIterator::DONE) {
590 break;
591 }
592 // We've reached a new break position. Save it.
593 td.fActualBreakPositions.insertElementAt(p, 0, status);
594 lastP = p;
595 tag = bi.getRuleStatus();
596 td.fActualTags.insertElementAt(tag, 0, status);
597 }
598 }
599 // The loop normally exits by means of the break in the middle.
600 // Make sure that the index was at the correct position for the break iterator to have
601 // returned DONE.
602 if (i != 0) {
603 errln("testPreceding(): iterator returned DONE prematurely.");
604 }
605
606 // Full check of all results.
607 td.checkResults("testPreceding", this);
608 }
609
610
611
612 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
613 UErrorCode status = U_ZERO_ERROR;
614 int i;
615 int32_t tag;
616
617 logln("testIsBoundary():");
618 bi.setText(td.fDataToBreak);
619 td.clearResults();
620
621 for (i = 0; i <= td.fDataToBreak.length(); i++) {
622 if (bi.isBoundary(i)) {
623 td.fActualBreakPositions.addElement(i, status); // Save result.
624 tag = bi.getRuleStatus();
625 td.fActualTags.addElement(tag, status);
626 }
627 }
628 td.checkResults("testIsBoundary: ", this);
629 }
630
631
632
633 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
634 {
635 iterator.setText(td.fDataToBreak);
636
637 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
638 int32_t offset = iterator.first();
639 int32_t testOffset;
640 int32_t count = 0;
641
642 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
643
644 if (*testIterator != iterator)
645 errln("clone() or operator!= failed: two clones compared unequal");
646
647 do {
648 testOffset = testIterator->first();
649 testOffset = testIterator->next(count);
650 if (offset != testOffset)
651 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
652
653 if (offset != RuleBasedBreakIterator::DONE) {
654 count++;
655 offset = iterator.next();
656
657 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
658 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
659 if (count > 10000 || offset == -1) {
660 errln("operator== failed too many times. Stopping test.");
661 if (offset == -1) {
662 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
663 }
664 return;
665 }
666 }
667 }
668 } while (offset != RuleBasedBreakIterator::DONE);
669
670 // now do it backwards...
671 offset = iterator.last();
672 count = 0;
673
674 do {
675 testOffset = testIterator->last();
676 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
677 if (offset != testOffset)
678 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
679
680 if (offset != RuleBasedBreakIterator::DONE) {
681 count--;
682 offset = iterator.previous();
683 }
684 } while (offset != RuleBasedBreakIterator::DONE);
685
686 delete testIterator;
687 }
688
689
690 //---------------------------------------------
691 //
692 // other tests
693 //
694 //---------------------------------------------
695 void RBBITest::TestEmptyString()
696 {
697 UnicodeString text = "";
698 UErrorCode status = U_ZERO_ERROR;
699
700 BITestData x(status);
701 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
702 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
703 if (U_FAILURE(status))
704 {
705 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
706 return;
707 }
708 generalIteratorTest(*bi, x);
709 delete bi;
710 }
711
712 void RBBITest::TestGetAvailableLocales()
713 {
714 int32_t locCount = 0;
715 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
716
717 if (locCount == 0)
718 dataerrln("getAvailableLocales() returned an empty list!");
719 // Just make sure that it's returning good memory.
720 int32_t i;
721 for (i = 0; i < locCount; ++i) {
722 logln(locList[i].getName());
723 }
724 }
725
726 //Testing the BreakIterator::getDisplayName() function
727 void RBBITest::TestGetDisplayName()
728 {
729 UnicodeString result;
730
731 BreakIterator::getDisplayName(Locale::getUS(), result);
732 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
733 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
734 + result);
735
736 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
737 if (result != "French (France)")
738 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
739 + result);
740 }
741 /**
742 * Test End Behaviour
743 * @bug 4068137
744 */
745 void RBBITest::TestEndBehaviour()
746 {
747 UErrorCode status = U_ZERO_ERROR;
748 UnicodeString testString("boo.");
749 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
750 if (U_FAILURE(status))
751 {
752 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
753 return;
754 }
755 wb->setText(testString);
756
757 if (wb->first() != 0)
758 errln("Didn't get break at beginning of string.");
759 if (wb->next() != 3)
760 errln("Didn't get break before period in \"boo.\"");
761 if (wb->current() != 4 && wb->next() != 4)
762 errln("Didn't get break at end of string.");
763 delete wb;
764 }
765 /*
766 * @bug 4153072
767 */
768 void RBBITest::TestBug4153072() {
769 UErrorCode status = U_ZERO_ERROR;
770 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
771 if (U_FAILURE(status))
772 {
773 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
774 return;
775 }
776 UnicodeString str("...Hello, World!...");
777 int32_t begin = 3;
778 int32_t end = str.length() - 3;
779 UBool onBoundary;
780
781 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
782 iter->adoptText(textIterator);
783 int index;
784 // Note: with the switch to UText, there is no way to restrict the
785 // iteration range to begin at an index other than zero.
786 // String character iterators created with a non-zero bound are
787 // treated by RBBI as being empty.
788 for (index = -1; index < begin + 1; ++index) {
789 onBoundary = iter->isBoundary(index);
790 if (index == 0? !onBoundary : onBoundary) {
791 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
792 " and begin index = " + begin);
793 }
794 }
795 delete iter;
796 }
797
798
799 //
800 // Test for problem reported by Ashok Matoria on 9 July 2007
801 // One.<kSoftHyphen><kSpace>Two.
802 //
803 // Sentence break at start (0) and then on calling next() it breaks at
804 // 'T' of "Two". Now, at this point if I do next() and
805 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
806 //
807 void RBBITest::TestBug5775() {
808 UErrorCode status = U_ZERO_ERROR;
809 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
810 TEST_ASSERT_SUCCESS(status);
811 if (U_FAILURE(status)) {
812 return;
813 }
814 // Check for status first for better handling of no data errors.
815 TEST_ASSERT(bi != NULL);
816 if (bi == NULL) {
817 return;
818 }
819
820 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
821 // 01234 56789
822 s = s.unescape();
823 bi->setText(s);
824 int pos = bi->next();
825 TEST_ASSERT(pos == 6);
826 pos = bi->next();
827 TEST_ASSERT(pos == 10);
828 pos = bi->previous();
829 TEST_ASSERT(pos == 6);
830 delete bi;
831 }
832
833
834
835 //------------------------------------------------------------------------------
836 //
837 // RBBITest::Extended Run RBBI Tests from an external test data file
838 //
839 //------------------------------------------------------------------------------
840
841 struct TestParams {
842 BreakIterator *bi; // Break iterator is set while parsing test source.
843 // Changed out whenever test data changes break type.
844
845 UnicodeString dataToBreak; // Data that is built up while parsing the test.
846 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
847 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
848 UVector32 *srcCol;
849
850 UText *textToBreak; // UText, could be UTF8 or UTF16.
851 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
852 CharString utf8String; // UTF-8 form of text to break.
853
854 TestParams(UErrorCode &status) : dataToBreak() {
855 bi = NULL;
856 expectedBreaks = new UVector32(status);
857 srcLine = new UVector32(status);
858 srcCol = new UVector32(status);
859 textToBreak = NULL;
860 textMap = new UVector32(status);
861 }
862
863 ~TestParams() {
864 delete bi;
865 delete expectedBreaks;
866 delete srcLine;
867 delete srcCol;
868 utext_close(textToBreak);
869 delete textMap;
870 }
871
872 int32_t getSrcLine(int32_t bp);
873 int32_t getExpectedBreak(int32_t bp);
874 int32_t getSrcCol(int32_t bp);
875
876 void setUTF16(UErrorCode &status);
877 void setUTF8(UErrorCode &status);
878 };
879
880 // Append a UnicodeString to a CharString with UTF-8 encoding.
881 // Substitute any invalid chars.
882 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
883 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
884 if (U_FAILURE(status)) {
885 return;
886 }
887 int32_t utf8Length;
888 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
889 src.getBuffer(), src.length(), // UTF-16 data
890 0xfffd, NULL, // Substitution char, number of subs.
891 &status);
892 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
893 return;
894 }
895 status = U_ZERO_ERROR;
896 int32_t capacity;
897 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
898 u_strToUTF8WithSub(buffer, utf8Length, NULL,
899 src.getBuffer(), src.length(),
900 0xfffd, NULL, &status);
901 dest.append(buffer, utf8Length, status);
902 }
903
904
905 void TestParams::setUTF16(UErrorCode &status) {
906 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
907 textMap->removeAllElements();
908 for (int32_t i=0; i<dataToBreak.length(); i++) {
909 if (i == dataToBreak.getChar32Start(i)) {
910 textMap->addElement(i, status);
911 } else {
912 textMap->addElement(-1, status);
913 }
914 }
915 textMap->addElement(dataToBreak.length(), status);
916 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
917 }
918
919
920 void TestParams::setUTF8(UErrorCode &status) {
921 if (U_FAILURE(status)) {
922 return;
923 }
924 utf8String.clear();
925 CharStringAppend(utf8String, dataToBreak, status);
926 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
927 if (U_FAILURE(status)) {
928 return;
929 }
930
931 textMap->removeAllElements();
932 int32_t utf16Index = 0;
933 for (;;) {
934 textMap->addElement(utf16Index, status);
935 UChar32 c32 = utext_current32(textToBreak);
936 if (c32 < 0) {
937 break;
938 }
939 utf16Index += U16_LENGTH(c32);
940 utext_next32(textToBreak);
941 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
942 textMap->addElement(-1, status);
943 }
944 }
945 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
946 }
947
948
949 int32_t TestParams::getSrcLine(int bp) {
950 if (bp >= textMap->size()) {
951 bp = textMap->size() - 1;
952 }
953 int32_t i = 0;
954 for(; bp >= 0 ; --bp) {
955 // Move to a character boundary if we are not on one already.
956 i = textMap->elementAti(bp);
957 if (i >= 0) {
958 break;
959 }
960 }
961 return srcLine->elementAti(i);
962 }
963
964
965 int32_t TestParams::getExpectedBreak(int bp) {
966 if (bp >= textMap->size()) {
967 return 0;
968 }
969 int32_t i = textMap->elementAti(bp);
970 int32_t retVal = 0;
971 if (i >= 0) {
972 retVal = expectedBreaks->elementAti(i);
973 }
974 return retVal;
975 }
976
977
978 int32_t TestParams::getSrcCol(int bp) {
979 if (bp >= textMap->size()) {
980 bp = textMap->size() - 1;
981 }
982 int32_t i = 0;
983 for(; bp >= 0; --bp) {
984 // Move bp to a character boundary if we are not on one already.
985 i = textMap->elementAti(bp);
986 if (i >= 0) {
987 break;
988 }
989 }
990 return srcCol->elementAti(i);
991 }
992
993
994 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
995 int32_t bp;
996 int32_t prevBP;
997 int32_t i;
998
999 TEST_ASSERT_SUCCESS(status);
1000 if (U_FAILURE(status)) {
1001 return;
1002 }
1003
1004 if (t->bi == NULL) {
1005 return;
1006 }
1007
1008 t->bi->setText(t->textToBreak, status);
1009 //
1010 // Run the iterator forward
1011 //
1012 prevBP = -1;
1013 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1014 if (prevBP == bp) {
1015 // Fail for lack of forward progress.
1016 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1017 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1018 break;
1019 }
1020
1021 // Check that there we didn't miss an expected break between the last one
1022 // and this one.
1023 for (i=prevBP+1; i<bp; i++) {
1024 if (t->getExpectedBreak(i) != 0) {
1025 int expected[] = {0, i};
1026 printStringBreaks(t->dataToBreak, expected, 2);
1027 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1028 i, t->getSrcLine(i), t->getSrcCol(i));
1029 }
1030 }
1031
1032 // Check that the break we did find was expected
1033 if (t->getExpectedBreak(bp) == 0) {
1034 int expected[] = {0, bp};
1035 printStringBreaks(t->textToBreak, expected, 2);
1036 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1037 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1038 } else {
1039 // The break was expected.
1040 // Check that the {nnn} tag value is correct.
1041 int32_t expectedTagVal = t->getExpectedBreak(bp);
1042 if (expectedTagVal == -1) {
1043 expectedTagVal = 0;
1044 }
1045 int32_t line = t->getSrcLine(bp);
1046 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1047 if (rs != expectedTagVal) {
1048 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1049 " Actual, Expected status = %4d, %4d",
1050 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1051 }
1052 }
1053
1054 prevBP = bp;
1055 }
1056
1057 // Verify that there were no missed expected breaks after the last one found
1058 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1059 if (t->getExpectedBreak(i) != 0) {
1060 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1061 i, t->getSrcLine(i), t->getSrcCol(i));
1062 }
1063 }
1064
1065 //
1066 // Run the iterator backwards, verify that the same breaks are found.
1067 //
1068 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
1069 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1070 if (prevBP == bp) {
1071 // Fail for lack of progress.
1072 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1073 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1074 break;
1075 }
1076
1077 // Check that we didn't miss an expected break between the last one
1078 // and this one. (UVector returns zeros for index out of bounds.)
1079 for (i=prevBP-1; i>bp; i--) {
1080 if (t->getExpectedBreak(i) != 0) {
1081 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1082 i, t->getSrcLine(i), t->getSrcCol(i));
1083 }
1084 }
1085
1086 // Check that the break we did find was expected
1087 if (t->getExpectedBreak(bp) == 0) {
1088 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1089 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1090 } else {
1091 // The break was expected.
1092 // Check that the {nnn} tag value is correct.
1093 int32_t expectedTagVal = t->getExpectedBreak(bp);
1094 if (expectedTagVal == -1) {
1095 expectedTagVal = 0;
1096 }
1097 int line = t->getSrcLine(bp);
1098 int32_t rs = t->bi->getRuleStatus();
1099 if (rs != expectedTagVal) {
1100 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1101 " Actual, Expected status = %4d, %4d",
1102 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1103 }
1104 }
1105
1106 prevBP = bp;
1107 }
1108
1109 // Verify that there were no missed breaks prior to the last one found
1110 for (i=prevBP-1; i>=0; i--) {
1111 if (t->getExpectedBreak(i) != 0) {
1112 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1113 i, t->getSrcLine(i), t->getSrcCol(i));
1114 }
1115 }
1116
1117 // Check isBoundary()
1118 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1119 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1120 UBool boundaryFound = t->bi->isBoundary(i);
1121 if (boundaryExpected != boundaryFound) {
1122 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1123 " Expected, Actual= %s, %s",
1124 i, t->getSrcLine(i), t->getSrcCol(i),
1125 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1126 }
1127 }
1128
1129 // Check following()
1130 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1131 int32_t actualBreak = t->bi->following(i);
1132 int32_t expectedBreak = BreakIterator::DONE;
1133 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1134 if (t->getExpectedBreak(j) != 0) {
1135 expectedBreak = j;
1136 break;
1137 }
1138 }
1139 if (expectedBreak != actualBreak) {
1140 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1141 " Expected, Actual= %d, %d",
1142 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1143 }
1144 }
1145
1146 // Check preceding()
1147 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1148 int32_t actualBreak = t->bi->preceding(i);
1149 int32_t expectedBreak = BreakIterator::DONE;
1150
1151 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1152 // preceding(trailing byte) will return the index of some preceding code point,
1153 // not the lead byte of the current code point, even though that has a smaller index.
1154 // Therefore, start looking at the expected break data not at i-1, but at
1155 // the start of code point index - 1.
1156 utext_setNativeIndex(t->textToBreak, i);
1157 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1158 for (; j >= 0; j--) {
1159 if (t->getExpectedBreak(j) != 0) {
1160 expectedBreak = j;
1161 break;
1162 }
1163 }
1164 if (expectedBreak != actualBreak) {
1165 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1166 " Expected, Actual= %d, %d",
1167 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1168 }
1169 }
1170 }
1171
1172
1173 void RBBITest::TestExtended() {
1174 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1175 UErrorCode status = U_ZERO_ERROR;
1176 Locale locale("");
1177
1178 UnicodeString rules;
1179 TestParams tp(status);
1180
1181 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1182 if (U_FAILURE(status)) {
1183 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1184 }
1185
1186
1187 //
1188 // Open and read the test data file.
1189 //
1190 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1191 char testFileName[1000];
1192 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1193 errln("Can't open test data. Path too long.");
1194 return;
1195 }
1196 strcpy(testFileName, testDataDirectory);
1197 strcat(testFileName, "rbbitst.txt");
1198
1199 int len;
1200 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1201 if (U_FAILURE(status)) {
1202 return; /* something went wrong, error already output */
1203 }
1204
1205
1206 bool skipTest = false; // Skip this test?
1207
1208 //
1209 // Put the test data into a UnicodeString
1210 //
1211 UnicodeString testString(FALSE, testFile, len);
1212
1213 enum EParseState{
1214 PARSE_COMMENT,
1215 PARSE_TAG,
1216 PARSE_DATA,
1217 PARSE_NUM
1218 }
1219 parseState = PARSE_TAG;
1220
1221 EParseState savedState = PARSE_TAG;
1222
1223 static const UChar CH_LF = 0x0a;
1224 static const UChar CH_CR = 0x0d;
1225 static const UChar CH_HASH = 0x23;
1226 /*static const UChar CH_PERIOD = 0x2e;*/
1227 static const UChar CH_LT = 0x3c;
1228 static const UChar CH_GT = 0x3e;
1229 static const UChar CH_BACKSLASH = 0x5c;
1230 static const UChar CH_BULLET = 0x2022;
1231
1232 int32_t lineNum = 1;
1233 int32_t colStart = 0;
1234 int32_t column = 0;
1235 int32_t charIdx = 0;
1236
1237 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1238
1239 for (charIdx = 0; charIdx < len; ) {
1240 status = U_ZERO_ERROR;
1241 UChar c = testString.charAt(charIdx);
1242 charIdx++;
1243 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1244 // treat CRLF as a unit
1245 c = CH_LF;
1246 charIdx++;
1247 }
1248 if (c == CH_LF || c == CH_CR) {
1249 lineNum++;
1250 colStart = charIdx;
1251 }
1252 column = charIdx - colStart + 1;
1253
1254 switch (parseState) {
1255 case PARSE_COMMENT:
1256 if (c == 0x0a || c == 0x0d) {
1257 parseState = savedState;
1258 }
1259 break;
1260
1261 case PARSE_TAG:
1262 {
1263 if (c == CH_HASH) {
1264 parseState = PARSE_COMMENT;
1265 savedState = PARSE_TAG;
1266 break;
1267 }
1268 if (u_isUWhiteSpace(c)) {
1269 break;
1270 }
1271 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1272 delete tp.bi;
1273 tp.bi = BreakIterator::createWordInstance(locale, status);
1274 skipTest = false;
1275 charIdx += 5;
1276 break;
1277 }
1278 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1279 delete tp.bi;
1280 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1281 skipTest = false;
1282 charIdx += 5;
1283 break;
1284 }
1285 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1286 delete tp.bi;
1287 tp.bi = BreakIterator::createLineInstance(locale, status);
1288 skipTest = false;
1289 charIdx += 5;
1290 break;
1291 }
1292 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1293 delete tp.bi;
1294 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1295 skipTest = false;
1296 charIdx += 5;
1297 break;
1298 }
1299 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1300 delete tp.bi;
1301 tp.bi = BreakIterator::createTitleInstance(locale, status);
1302 charIdx += 6;
1303 break;
1304 }
1305
1306 // <locale loc_name>
1307 localeMatcher.reset(testString);
1308 if (localeMatcher.lookingAt(charIdx-1, status)) {
1309 UnicodeString localeName = localeMatcher.group(1, status);
1310 char localeName8[100];
1311 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1312 locale = Locale::createFromName(localeName8);
1313 charIdx += localeMatcher.group(0, status).length() - 1;
1314 TEST_ASSERT_SUCCESS(status);
1315 break;
1316 }
1317 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1318 parseState = PARSE_DATA;
1319 charIdx += 5;
1320 tp.dataToBreak = "";
1321 tp.expectedBreaks->removeAllElements();
1322 tp.srcCol ->removeAllElements();
1323 tp.srcLine->removeAllElements();
1324 break;
1325 }
1326
1327 errln("line %d: Tag expected in test file.", lineNum);
1328 parseState = PARSE_COMMENT;
1329 savedState = PARSE_DATA;
1330 goto end_test; // Stop the test.
1331 }
1332 break;
1333
1334 case PARSE_DATA:
1335 if (c == CH_BULLET) {
1336 int32_t breakIdx = tp.dataToBreak.length();
1337 tp.expectedBreaks->setSize(breakIdx+1);
1338 tp.expectedBreaks->setElementAt(-1, breakIdx);
1339 tp.srcLine->setSize(breakIdx+1);
1340 tp.srcLine->setElementAt(lineNum, breakIdx);
1341 tp.srcCol ->setSize(breakIdx+1);
1342 tp.srcCol ->setElementAt(column, breakIdx);
1343 break;
1344 }
1345
1346 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1347 // Add final entry to mappings from break location to source file position.
1348 // Need one extra because last break position returned is after the
1349 // last char in the data, not at the last char.
1350 tp.srcLine->addElement(lineNum, status);
1351 tp.srcCol ->addElement(column, status);
1352
1353 parseState = PARSE_TAG;
1354 charIdx += 6;
1355
1356 if (!skipTest) {
1357 // RUN THE TEST!
1358 status = U_ZERO_ERROR;
1359 tp.setUTF16(status);
1360 executeTest(&tp, status);
1361 TEST_ASSERT_SUCCESS(status);
1362
1363 // Run again, this time with UTF-8 text wrapped in a UText.
1364 status = U_ZERO_ERROR;
1365 tp.setUTF8(status);
1366 TEST_ASSERT_SUCCESS(status);
1367 executeTest(&tp, status);
1368 }
1369 break;
1370 }
1371
1372 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1373 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1374 // Get the code point from the name and insert it into the test data.
1375 // (Damn, no API takes names in Unicode !!!
1376 // we've got to take it back to char *)
1377 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1378 int32_t nameLength = nameEndIdx - (charIdx+2);
1379 char charNameBuf[200];
1380 UChar32 theChar = -1;
1381 if (nameEndIdx != -1) {
1382 UErrorCode status = U_ZERO_ERROR;
1383 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1384 charNameBuf[sizeof(charNameBuf)-1] = 0;
1385 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1386 if (U_FAILURE(status)) {
1387 theChar = -1;
1388 }
1389 }
1390 if (theChar == -1) {
1391 errln("Error in named character in test file at line %d, col %d",
1392 lineNum, column);
1393 } else {
1394 // Named code point was recognized. Insert it
1395 // into the test data.
1396 tp.dataToBreak.append(theChar);
1397 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1398 tp.srcLine->addElement(lineNum, status);
1399 tp.srcCol ->addElement(column, status);
1400 }
1401 }
1402 if (nameEndIdx > charIdx) {
1403 charIdx = nameEndIdx+1;
1404
1405 }
1406 break;
1407 }
1408
1409
1410
1411
1412 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1413 charIdx++;
1414 int32_t breakIdx = tp.dataToBreak.length();
1415 tp.expectedBreaks->setSize(breakIdx+1);
1416 tp.expectedBreaks->setElementAt(-1, breakIdx);
1417 tp.srcLine->setSize(breakIdx+1);
1418 tp.srcLine->setElementAt(lineNum, breakIdx);
1419 tp.srcCol ->setSize(breakIdx+1);
1420 tp.srcCol ->setElementAt(column, breakIdx);
1421 break;
1422 }
1423
1424 if (c == CH_LT) {
1425 tagValue = 0;
1426 parseState = PARSE_NUM;
1427 break;
1428 }
1429
1430 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1431 parseState = PARSE_COMMENT;
1432 savedState = PARSE_DATA;
1433 break;
1434 }
1435
1436 if (c == CH_BACKSLASH) {
1437 // Check for \ at end of line, a line continuation.
1438 // Advance over (discard) the newline
1439 UChar32 cp = testString.char32At(charIdx);
1440 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1441 // We have a CR LF
1442 // Need an extra increment of the input ptr to move over both of them
1443 charIdx++;
1444 }
1445 if (cp == CH_LF || cp == CH_CR) {
1446 lineNum++;
1447 colStart = charIdx;
1448 charIdx++;
1449 break;
1450 }
1451
1452 // Let unescape handle the back slash.
1453 cp = testString.unescapeAt(charIdx);
1454 if (cp != -1) {
1455 // Escape sequence was recognized. Insert the char
1456 // into the test data.
1457 tp.dataToBreak.append(cp);
1458 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1459 tp.srcLine->addElement(lineNum, status);
1460 tp.srcCol ->addElement(column, status);
1461 }
1462 break;
1463 }
1464
1465
1466 // Not a recognized backslash escape sequence.
1467 // Take the next char as a literal.
1468 // TODO: Should this be an error?
1469 c = testString.charAt(charIdx);
1470 charIdx = testString.moveIndex32(charIdx, 1);
1471 }
1472
1473 // Normal, non-escaped data char.
1474 tp.dataToBreak.append(c);
1475
1476 // Save the mapping from offset in the data to line/column numbers in
1477 // the original input file. Will be used for better error messages only.
1478 // If there's an expected break before this char, the slot in the mapping
1479 // vector will already be set for this char; don't overwrite it.
1480 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1481 tp.srcLine->addElement(lineNum, status);
1482 tp.srcCol ->addElement(column, status);
1483 }
1484 break;
1485
1486
1487 case PARSE_NUM:
1488 // We are parsing an expected numeric tag value, like <1234>,
1489 // within a chunk of data.
1490 if (u_isUWhiteSpace(c)) {
1491 break;
1492 }
1493
1494 if (c == CH_GT) {
1495 // Finished the number. Add the info to the expected break data,
1496 // and switch parse state back to doing plain data.
1497 parseState = PARSE_DATA;
1498 if (tagValue == 0) {
1499 tagValue = -1;
1500 }
1501 int32_t breakIdx = tp.dataToBreak.length();
1502 tp.expectedBreaks->setSize(breakIdx+1);
1503 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1504 tp.srcLine->setSize(breakIdx+1);
1505 tp.srcLine->setElementAt(lineNum, breakIdx);
1506 tp.srcCol ->setSize(breakIdx+1);
1507 tp.srcCol ->setElementAt(column, breakIdx);
1508 break;
1509 }
1510
1511 if (u_isdigit(c)) {
1512 tagValue = tagValue*10 + u_charDigitValue(c);
1513 break;
1514 }
1515
1516 errln("Syntax Error in test file at line %d, col %d",
1517 lineNum, column);
1518 parseState = PARSE_COMMENT;
1519 goto end_test; // Stop the test
1520 break;
1521 }
1522
1523
1524 if (U_FAILURE(status)) {
1525 dataerrln("ICU Error %s while parsing test file at line %d.",
1526 u_errorName(status), lineNum);
1527 status = U_ZERO_ERROR;
1528 goto end_test; // Stop the test
1529 }
1530
1531 }
1532
1533 end_test:
1534 delete [] testFile;
1535 #endif
1536 }
1537
1538
1539 //-------------------------------------------------------------------------------
1540 //
1541 // TestDictRules create a break iterator from source rules that includes a
1542 // dictionary range. Regression for bug #7130. Source rules
1543 // do not declare a break iterator type (word, line, sentence, etc.
1544 // but the dictionary code, without a type, would loop.
1545 //
1546 //-------------------------------------------------------------------------------
1547 void RBBITest::TestDictRules() {
1548 const char *rules = "$dictionary = [a-z]; \n"
1549 "!!forward; \n"
1550 "$dictionary $dictionary; \n"
1551 "!!reverse; \n"
1552 "$dictionary $dictionary; \n";
1553 const char *text = "aa";
1554 UErrorCode status = U_ZERO_ERROR;
1555 UParseError parseError;
1556
1557 RuleBasedBreakIterator bi(rules, parseError, status);
1558 if (U_SUCCESS(status)) {
1559 UnicodeString utext = text;
1560 bi.setText(utext);
1561 int32_t position;
1562 int32_t loops;
1563 for (loops = 0; loops<10; loops++) {
1564 position = bi.next();
1565 if (position == RuleBasedBreakIterator::DONE) {
1566 break;
1567 }
1568 }
1569 TEST_ASSERT(loops == 1);
1570 } else {
1571 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1572 }
1573 }
1574
1575
1576
1577 //-------------------------------------------------------------------------------
1578 //
1579 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1580 // return the data in one big UChar * buffer, which the caller must delete.
1581 //
1582 // parameters:
1583 // fileName: the name of the file, with no directory part. The test data directory
1584 // is assumed.
1585 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1586 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1587 // specified here. The BOM, if it exists, will be stripped from the returned data.
1588 // Pass NULL for the system default encoding.
1589 // status
1590 // returns:
1591 // The file data, converted to UChar.
1592 // The caller must delete this when done with
1593 // delete [] theBuffer;
1594 //
1595 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1596 // Move this function to some common place.
1597 //
1598 //--------------------------------------------------------------------------------
1599 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1600 UChar *retPtr = NULL;
1601 char *fileBuf = NULL;
1602 UConverter* conv = NULL;
1603 FILE *f = NULL;
1604
1605 ulen = 0;
1606 if (U_FAILURE(status)) {
1607 return retPtr;
1608 }
1609
1610 //
1611 // Open the file.
1612 //
1613 f = fopen(fileName, "rb");
1614 if (f == 0) {
1615 dataerrln("Error opening test data file %s\n", fileName);
1616 status = U_FILE_ACCESS_ERROR;
1617 return NULL;
1618 }
1619 //
1620 // Read it in
1621 //
1622 int fileSize;
1623 int amt_read;
1624
1625 fseek( f, 0, SEEK_END);
1626 fileSize = ftell(f);
1627 fileBuf = new char[fileSize];
1628 fseek(f, 0, SEEK_SET);
1629 amt_read = fread(fileBuf, 1, fileSize, f);
1630 if (amt_read != fileSize || fileSize <= 0) {
1631 errln("Error reading test data file.");
1632 goto cleanUpAndReturn;
1633 }
1634
1635 //
1636 // Look for a Unicode Signature (BOM) on the data just read
1637 //
1638 int32_t signatureLength;
1639 const char * fileBufC;
1640 const char* bomEncoding;
1641
1642 fileBufC = fileBuf;
1643 bomEncoding = ucnv_detectUnicodeSignature(
1644 fileBuf, fileSize, &signatureLength, &status);
1645 if(bomEncoding!=NULL ){
1646 fileBufC += signatureLength;
1647 fileSize -= signatureLength;
1648 encoding = bomEncoding;
1649 }
1650
1651 //
1652 // Open a converter to take the rule file to UTF-16
1653 //
1654 conv = ucnv_open(encoding, &status);
1655 if (U_FAILURE(status)) {
1656 goto cleanUpAndReturn;
1657 }
1658
1659 //
1660 // Convert the rules to UChar.
1661 // Preflight first to determine required buffer size.
1662 //
1663 ulen = ucnv_toUChars(conv,
1664 NULL, // dest,
1665 0, // destCapacity,
1666 fileBufC,
1667 fileSize,
1668 &status);
1669 if (status == U_BUFFER_OVERFLOW_ERROR) {
1670 // Buffer Overflow is expected from the preflight operation.
1671 status = U_ZERO_ERROR;
1672
1673 retPtr = new UChar[ulen+1];
1674 ucnv_toUChars(conv,
1675 retPtr, // dest,
1676 ulen+1,
1677 fileBufC,
1678 fileSize,
1679 &status);
1680 }
1681
1682 cleanUpAndReturn:
1683 fclose(f);
1684 delete []fileBuf;
1685 ucnv_close(conv);
1686 if (U_FAILURE(status)) {
1687 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1688 delete []retPtr;
1689 retPtr = 0;
1690 ulen = 0;
1691 };
1692 return retPtr;
1693 }
1694
1695
1696
1697 //--------------------------------------------------------------------------------------------
1698 //
1699 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1700 //
1701 //-------------------------------------------------------------------------------------------
1702 void RBBITest::TestUnicodeFiles() {
1703 RuleBasedBreakIterator *bi;
1704 UErrorCode status = U_ZERO_ERROR;
1705
1706 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1707 TEST_ASSERT_SUCCESS(status);
1708 if (U_SUCCESS(status)) {
1709 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1710 }
1711 delete bi;
1712
1713 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1714 TEST_ASSERT_SUCCESS(status);
1715 if (U_SUCCESS(status)) {
1716 runUnicodeTestData("WordBreakTest.txt", bi);
1717 }
1718 delete bi;
1719
1720 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1721 TEST_ASSERT_SUCCESS(status);
1722 if (U_SUCCESS(status)) {
1723 runUnicodeTestData("SentenceBreakTest.txt", bi);
1724 }
1725 delete bi;
1726
1727 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1728 TEST_ASSERT_SUCCESS(status);
1729 if (U_SUCCESS(status)) {
1730 runUnicodeTestData("LineBreakTest.txt", bi);
1731 }
1732 delete bi;
1733 }
1734
1735
1736 // Check for test cases from the Unicode test data files that are known to fail
1737 // and should be skipped because ICU is not yet able to fully implement the spec.
1738 // See ticket #7270.
1739
1740 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1741 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
1742 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
1743 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
1744 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
1745 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
1746 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
1747 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
1748 };
1749 if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1750 return FALSE;
1751 }
1752
1753 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1754 if (testCase == UnicodeString(badTestCases[i])) {
1755 return logKnownIssue("7270");
1756 }
1757 }
1758 return FALSE;
1759 }
1760
1761
1762 //--------------------------------------------------------------------------------------------
1763 //
1764 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1765 //
1766 //-------------------------------------------------------------------------------------------
1767 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1768 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1769 UErrorCode status = U_ZERO_ERROR;
1770
1771 //
1772 // Open and read the test data file, put it into a UnicodeString.
1773 //
1774 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1775 char testFileName[1000];
1776 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1777 dataerrln("Can't open test data. Path too long.");
1778 return;
1779 }
1780 strcpy(testFileName, testDataDirectory);
1781 strcat(testFileName, fileName);
1782
1783 logln("Opening data file %s\n", fileName);
1784
1785 int len;
1786 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1787 if (status != U_FILE_ACCESS_ERROR) {
1788 TEST_ASSERT_SUCCESS(status);
1789 TEST_ASSERT(testFile != NULL);
1790 }
1791 if (U_FAILURE(status) || testFile == NULL) {
1792 return; /* something went wrong, error already output */
1793 }
1794 UnicodeString testFileAsString(TRUE, testFile, len);
1795
1796 //
1797 // Parse the test data file using a regular expression.
1798 // Each kind of token is recognized in its own capture group; what type of item was scanned
1799 // is identified by which group had a match.
1800 //
1801 // Caputure Group # 1 2 3 4 5
1802 // Parses this item: divide x hex digits comment \n unrecognized \n
1803 //
1804 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1805 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1806 UnicodeString testString;
1807 UVector32 breakPositions(status);
1808 int lineNumber = 1;
1809 TEST_ASSERT_SUCCESS(status);
1810 if (U_FAILURE(status)) {
1811 return;
1812 }
1813
1814 //
1815 // Scan through each test case, building up the string to be broken in testString,
1816 // and the positions that should be boundaries in the breakPositions vector.
1817 //
1818 int spin = 0;
1819 while (tokenMatcher.find()) {
1820 if(tokenMatcher.hitEnd()) {
1821 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1822 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1823 and caused an infinite loop here on EBCDIC systems!
1824 */
1825 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1826 // return;
1827 }
1828 if (tokenMatcher.start(1, status) >= 0) {
1829 // Scanned a divide sign, indicating a break position in the test data.
1830 if (testString.length()>0) {
1831 breakPositions.addElement(testString.length(), status);
1832 }
1833 }
1834 else if (tokenMatcher.start(2, status) >= 0) {
1835 // Scanned an 'x', meaning no break at this position in the test data
1836 // Nothing to be done here.
1837 }
1838 else if (tokenMatcher.start(3, status) >= 0) {
1839 // Scanned Hex digits. Convert them to binary, append to the character data string.
1840 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1841 int length = hexNumber.length();
1842 if (length<=8) {
1843 char buf[10];
1844 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1845 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1846 if (c<=0x10ffff) {
1847 testString.append(c);
1848 } else {
1849 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1850 fileName, lineNumber);
1851 }
1852 } else {
1853 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1854 fileName, lineNumber);
1855 }
1856 }
1857 else if (tokenMatcher.start(4, status) >= 0) {
1858 // Scanned to end of a line, possibly skipping over a comment in the process.
1859 // If the line from the file contained test data, run the test now.
1860 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1861 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1862 }
1863
1864 // Clear out this test case.
1865 // The string and breakPositions vector will be refilled as the next
1866 // test case is parsed.
1867 testString.remove();
1868 breakPositions.removeAllElements();
1869 lineNumber++;
1870 } else {
1871 // Scanner catchall. Something unrecognized appeared on the line.
1872 char token[16];
1873 UnicodeString uToken = tokenMatcher.group(0, status);
1874 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1875 token[sizeof(token)-1] = 0;
1876 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1877
1878 // Clean up, in preparation for continuing with the next line.
1879 testString.remove();
1880 breakPositions.removeAllElements();
1881 lineNumber++;
1882 }
1883 TEST_ASSERT_SUCCESS(status);
1884 if (U_FAILURE(status)) {
1885 break;
1886 }
1887 }
1888
1889 delete [] testFile;
1890 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1891 }
1892
1893 //--------------------------------------------------------------------------------------------
1894 //
1895 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1896 // test data files. Do only a simple, forward-only check -
1897 // this test is mostly to check that ICU and the Unicode
1898 // data agree with each other.
1899 //
1900 //--------------------------------------------------------------------------------------------
1901 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1902 const UnicodeString &testString, // Text data to be broken
1903 UVector32 *breakPositions, // Positions where breaks should be found.
1904 RuleBasedBreakIterator *bi) {
1905 int32_t pos; // Break Position in the test string
1906 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1907 int32_t expectedPos; // Expected break position (index into test string)
1908
1909 bi->setText(testString);
1910 pos = bi->first();
1911 pos = bi->next();
1912
1913 while (pos != BreakIterator::DONE) {
1914 if (expectedI >= breakPositions->size()) {
1915 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1916 testFileName, lineNumber, pos);
1917 break;
1918 }
1919 expectedPos = breakPositions->elementAti(expectedI);
1920 if (pos < expectedPos) {
1921 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1922 testFileName, lineNumber, pos);
1923 break;
1924 }
1925 if (pos > expectedPos) {
1926 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1927 testFileName, lineNumber, expectedPos);
1928 break;
1929 }
1930 pos = bi->next();
1931 expectedI++;
1932 }
1933
1934 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1935 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1936 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1937 }
1938 }
1939
1940
1941
1942 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1943 //---------------------------------------------------------------------------------------
1944 //
1945 // class RBBIMonkeyKind
1946 //
1947 // Monkey Test for Break Iteration
1948 // Abstract interface class. Concrete derived classes independently
1949 // implement the break rules for different iterator types.
1950 //
1951 // The Monkey Test itself doesn't know which type of break iterator it is
1952 // testing, but works purely in terms of the interface defined here.
1953 //
1954 //---------------------------------------------------------------------------------------
1955 class RBBIMonkeyKind {
1956 public:
1957 // Return a UVector of UnicodeSets, representing the character classes used
1958 // for this type of iterator.
1959 virtual UVector *charClasses() = 0;
1960
1961 // Set the test text on which subsequent calls to next() will operate
1962 virtual void setText(const UnicodeString &s) = 0;
1963
1964 // Find the next break postion, starting from the prev break position, or from zero.
1965 // Return -1 after reaching end of string.
1966 virtual int32_t next(int32_t i) = 0;
1967
1968 virtual ~RBBIMonkeyKind();
1969 UErrorCode deferredStatus;
1970
1971
1972 protected:
1973 RBBIMonkeyKind();
1974
1975 private:
1976 };
1977
1978 RBBIMonkeyKind::RBBIMonkeyKind() {
1979 deferredStatus = U_ZERO_ERROR;
1980 }
1981
1982 RBBIMonkeyKind::~RBBIMonkeyKind() {
1983 }
1984
1985
1986 //----------------------------------------------------------------------------------------
1987 //
1988 // Random Numbers. Similar to standard lib rand() and srand()
1989 // Not using library to
1990 // 1. Get same results on all platforms.
1991 // 2. Get access to current seed, to more easily reproduce failures.
1992 //
1993 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to replay a particular sequence.
static uint32_t m_seed = 1;

// Advance the linear congruential generator by one step and return a value
// in the range 0..32767 taken from bits 16..30 of the new state.
// Arithmetic is unsigned 32 bit and wraps on overflow.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245u;
    const uint32_t kIncrement  = 12345u;

    m_seed = kMultiplier * m_seed + kIncrement;
    return (m_seed >> 16) & 0x7FFFu;
}
2001
2002
2003 //------------------------------------------------------------------------------------------
2004 //
2005 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2006 // of RBBIMonkeyKind.
2007 //
2008 //------------------------------------------------------------------------------------------
2009 class RBBICharMonkey: public RBBIMonkeyKind {
2010 public:
2011 RBBICharMonkey();
2012 virtual ~RBBICharMonkey();
2013 virtual UVector *charClasses();
2014 virtual void setText(const UnicodeString &s);
2015 virtual int32_t next(int32_t i);
2016 private:
2017 UVector *fSets;
2018
2019 UnicodeSet *fCRLFSet;
2020 UnicodeSet *fControlSet;
2021 UnicodeSet *fExtendSet;
2022 UnicodeSet *fRegionalIndicatorSet;
2023 UnicodeSet *fPrependSet;
2024 UnicodeSet *fSpacingSet;
2025 UnicodeSet *fLSet;
2026 UnicodeSet *fVSet;
2027 UnicodeSet *fTSet;
2028 UnicodeSet *fLVSet;
2029 UnicodeSet *fLVTSet;
2030 UnicodeSet *fHangulSet;
2031 UnicodeSet *fAnySet;
2032 UnicodeSet *fEmojiModifierSet;
2033 UnicodeSet *fEmojiBaseSet;
2034 UnicodeSet *fZWJSet;
2035 UnicodeSet *fGAZSet;
2036
2037 const UnicodeString *fText;
2038 };
2039
2040
2041 RBBICharMonkey::RBBICharMonkey() {
2042 UErrorCode status = U_ZERO_ERROR;
2043
2044 fText = NULL;
2045
2046 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2047 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
2048 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
2049 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2050 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2051 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2052 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2053 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2054 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2055 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2056 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2057 fHangulSet = new UnicodeSet();
2058 fHangulSet->addAll(*fLSet);
2059 fHangulSet->addAll(*fVSet);
2060 fHangulSet->addAll(*fTSet);
2061 fHangulSet->addAll(*fLVSet);
2062 fHangulSet->addAll(*fLVTSet);
2063 fAnySet = new UnicodeSet(0, 0x10ffff);
2064
2065
2066
2067 fEmojiBaseSet = new UnicodeSet(UnicodeString(
2068 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
2069 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
2070 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
2071 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
2072
2073 fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
2074 fZWJSet = new UnicodeSet(0x200D, 0x200D);
2075 fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2764\\U0001F308\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8]"), status);
2076
2077 fSets = new UVector(status);
2078 fSets->addElement(fCRLFSet, status);
2079 fSets->addElement(fControlSet, status);
2080 fSets->addElement(fExtendSet, status);
2081 fSets->addElement(fRegionalIndicatorSet, status);
2082 if (!fPrependSet->isEmpty()) {
2083 fSets->addElement(fPrependSet, status);
2084 }
2085 fSets->addElement(fSpacingSet, status);
2086 fSets->addElement(fHangulSet, status);
2087 fSets->addElement(fAnySet, status);
2088 fSets->addElement(fEmojiBaseSet, status);
2089 fSets->addElement(fEmojiModifierSet, status);
2090 fSets->addElement(fZWJSet, status);
2091 fSets->addElement(fGAZSet, status);
2092 if (U_FAILURE(status)) {
2093 deferredStatus = status;
2094 }
2095 }
2096
2097
2098 void RBBICharMonkey::setText(const UnicodeString &s) {
2099 fText = &s;
2100 }
2101
2102
2103
2104 int32_t RBBICharMonkey::next(int32_t prevPos) {
2105 int p0, p1, p2, p3; // Indices of the significant code points around the
2106 // break position being tested. The candidate break
2107 // location is before p2.
2108
2109 int breakPos = -1;
2110
2111 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2112 UChar32 cBase; // for (X Extend*) patterns, the X character.
2113
2114 if (U_FAILURE(deferredStatus)) {
2115 return -1;
2116 }
2117
2118 // Previous break at end of string. return DONE.
2119 if (prevPos >= fText->length()) {
2120 return -1;
2121 }
2122 p0 = p1 = p2 = p3 = prevPos;
2123 c3 = fText->char32At(prevPos);
2124 c0 = c1 = c2 = cBase = 0;
2125 (void)p0; // suppress set but not used warning.
2126 (void)c0;
2127
2128 // Loop runs once per "significant" character position in the input text.
2129 for (;;) {
2130 // Move all of the positions forward in the input string.
2131 p0 = p1; c0 = c1;
2132 p1 = p2; c1 = c2;
2133 p2 = p3; c2 = c3;
2134
2135 // Advancd p3 by one codepoint
2136 p3 = fText->moveIndex32(p3, 1);
2137 c3 = fText->char32At(p3);
2138
2139 if (p1 == p2) {
2140 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2141 continue;
2142 }
2143 if (p2 == fText->length()) {
2144 // Reached end of string. Always a break position.
2145 break;
2146 }
2147
2148 // Rule GB3 CR x LF
2149 // No Extend or Format characters may appear between the CR and LF,
2150 // which requires the additional check for p2 immediately following p1.
2151 //
2152 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2153 continue;
2154 }
2155
2156 // Rule (GB4). ( Control | CR | LF ) <break>
2157 if (fControlSet->contains(c1) ||
2158 c1 == 0x0D ||
2159 c1 == 0x0A) {
2160 break;
2161 }
2162
2163 // Rule (GB5) <break> ( Control | CR | LF )
2164 //
2165 if (fControlSet->contains(c2) ||
2166 c2 == 0x0D ||
2167 c2 == 0x0A) {
2168 break;
2169 }
2170
2171
2172 // Rule (GB6) L x ( L | V | LV | LVT )
2173 if (fLSet->contains(c1) &&
2174 (fLSet->contains(c2) ||
2175 fVSet->contains(c2) ||
2176 fLVSet->contains(c2) ||
2177 fLVTSet->contains(c2))) {
2178 continue;
2179 }
2180
2181 // Rule (GB7) ( LV | V ) x ( V | T )
2182 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2183 (fVSet->contains(c2) || fTSet->contains(c2))) {
2184 continue;
2185 }
2186
2187 // Rule (GB8) ( LVT | T) x T
2188 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2189 fTSet->contains(c2)) {
2190 continue;
2191 }
2192
2193 // Rule (GB9) x (Extend | ZWJ)
2194 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
2195 if (!fExtendSet->contains(c1)) {
2196 cBase = c1;
2197 }
2198 continue;
2199 }
2200
2201 // Rule (GB9a) x SpacingMark
2202 if (fSpacingSet->contains(c2)) {
2203 continue;
2204 }
2205
2206 // Rule (GB9b) Prepend x
2207 if (fPrependSet->contains(c1)) {
2208 continue;
2209 }
2210
2211 // Rule (GB10) ($E_Base | $GAZ) $Extend* $E_Modifier;
2212 if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2213 continue;
2214 }
2215 if ((fEmojiBaseSet->contains(cBase) || fGAZSet->contains(cBase)) &&
2216 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2217 continue;
2218 }
2219
2220 // Rule (GB11) ZWJ x Glue_After_Zwj
2221 if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
2222 continue;
2223 }
2224
2225 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2226 // Note: The first if condition is a little tricky. We only need to force
2227 // a break if there are three or more contiguous RIs. If there are
2228 // only two, a break following will occur via other rules, and will include
2229 // any trailing extend characters, which is needed behavior.
2230 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2231 && fRegionalIndicatorSet->contains(c2)) {
2232 break;
2233 }
2234 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2235 continue;
2236 }
2237
2238 // Rule (GB999) Any <break> Any
2239 break;
2240 }
2241
2242 breakPos = p2;
2243 return breakPos;
2244 }
2245
2246
2247
2248 UVector *RBBICharMonkey::charClasses() {
2249 return fSets;
2250 }
2251
2252
2253 RBBICharMonkey::~RBBICharMonkey() {
2254 delete fSets;
2255 delete fCRLFSet;
2256 delete fControlSet;
2257 delete fExtendSet;
2258 delete fRegionalIndicatorSet;
2259 delete fPrependSet;
2260 delete fSpacingSet;
2261 delete fLSet;
2262 delete fVSet;
2263 delete fTSet;
2264 delete fLVSet;
2265 delete fLVTSet;
2266 delete fHangulSet;
2267 delete fAnySet;
2268 delete fEmojiBaseSet;
2269 delete fEmojiModifierSet;
2270 delete fZWJSet;
2271 delete fGAZSet;
2272 }
2273
2274 //------------------------------------------------------------------------------------------
2275 //
2276 // class RBBIWordMonkey Word Break specific implementation
2277 // of RBBIMonkeyKind.
2278 //
2279 //------------------------------------------------------------------------------------------
2280 class RBBIWordMonkey: public RBBIMonkeyKind {
2281 public:
2282 RBBIWordMonkey();
2283 virtual ~RBBIWordMonkey();
2284 virtual UVector *charClasses();
2285 virtual void setText(const UnicodeString &s);
2286 virtual int32_t next(int32_t i);
2287 private:
2288 UVector *fSets;
2289
2290 UnicodeSet *fCRSet;
2291 UnicodeSet *fLFSet;
2292 UnicodeSet *fNewlineSet;
2293 UnicodeSet *fRegionalIndicatorSet;
2294 UnicodeSet *fKatakanaSet;
2295 UnicodeSet *fHebrew_LetterSet;
2296 UnicodeSet *fALetterSet;
2297 // TODO(jungshik): Do we still need this change?
2298 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2299 UnicodeSet *fSingle_QuoteSet;
2300 UnicodeSet *fDouble_QuoteSet;
2301 UnicodeSet *fMidNumLetSet;
2302 UnicodeSet *fMidLetterSet;
2303 UnicodeSet *fMidNumSet;
2304 UnicodeSet *fNumericSet;
2305 UnicodeSet *fFormatSet;
2306 UnicodeSet *fOtherSet;
2307 UnicodeSet *fExtendSet;
2308 UnicodeSet *fExtendNumLetSet;
2309 UnicodeSet *fDictionaryCjkSet;
2310 UnicodeSet *fEBaseSet;
2311 UnicodeSet *fEModifierSet;
2312 UnicodeSet *fZWSSet;
2313 UnicodeSet *fGAZSet;
2314
2315 const UnicodeString *fText;
2316 };
2317
2318
2319 RBBIWordMonkey::RBBIWordMonkey()
2320 {
2321 UErrorCode status = U_ZERO_ERROR;
2322
2323 fSets = new UVector(status);
2324
2325 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2326 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2327 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2328 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2329 // Exclude Hangul syllables from ALetterSet during testing.
2330 // Leave CJK dictionary characters out from the monkey tests!
2331 #if 0
2332 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2333 "[\\p{Line_Break = Complex_Context}"
2334 "-\\p{Grapheme_Cluster_Break = Extend}"
2335 "-\\p{Grapheme_Cluster_Break = Control}"
2336 "]]",
2337 status);
2338 #endif
2339 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2340 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2341 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2342 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2343 fALetterSet->removeAll(*fDictionaryCjkSet);
2344 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2345 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2346 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2347 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"), status);
2348 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2349 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2350 // we should figure out why
2351 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2352 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2353 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2354 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2355
2356 fEBaseSet = new UnicodeSet(UnicodeString(
2357 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
2358 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
2359 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
2360 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
2361
2362 fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
2363 fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
2364 fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2764\\U0001F308\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8]"), status);
2365 fExtendSet->removeAll(*fZWSSet);
2366
2367
2368 fOtherSet = new UnicodeSet();
2369 if(U_FAILURE(status)) {
2370 deferredStatus = status;
2371 return;
2372 }
2373
2374 fOtherSet->complement();
2375 fOtherSet->removeAll(*fCRSet);
2376 fOtherSet->removeAll(*fLFSet);
2377 fOtherSet->removeAll(*fNewlineSet);
2378 fOtherSet->removeAll(*fKatakanaSet);
2379 fOtherSet->removeAll(*fHebrew_LetterSet);
2380 fOtherSet->removeAll(*fALetterSet);
2381 fOtherSet->removeAll(*fSingle_QuoteSet);
2382 fOtherSet->removeAll(*fDouble_QuoteSet);
2383 fOtherSet->removeAll(*fMidLetterSet);
2384 fOtherSet->removeAll(*fMidNumSet);
2385 fOtherSet->removeAll(*fNumericSet);
2386 fOtherSet->removeAll(*fExtendNumLetSet);
2387 fOtherSet->removeAll(*fFormatSet);
2388 fOtherSet->removeAll(*fExtendSet);
2389 fOtherSet->removeAll(*fRegionalIndicatorSet);
2390 fOtherSet->removeAll(*fEBaseSet);
2391 fOtherSet->removeAll(*fEModifierSet);
2392 fOtherSet->removeAll(*fZWSSet);
2393 fOtherSet->removeAll(*fGAZSet);
2394
2395 // Inhibit dictionary characters from being tested at all.
2396 fOtherSet->removeAll(*fDictionaryCjkSet);
2397 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2398
2399 fSets->addElement(fCRSet, status);
2400 fSets->addElement(fLFSet, status);
2401 fSets->addElement(fNewlineSet, status);
2402 fSets->addElement(fRegionalIndicatorSet, status);
2403 fSets->addElement(fHebrew_LetterSet, status);
2404 fSets->addElement(fALetterSet, status);
2405 fSets->addElement(fSingle_QuoteSet, status);
2406 fSets->addElement(fDouble_QuoteSet, status);
2407 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2408 fSets->addElement(fMidLetterSet, status);
2409 fSets->addElement(fMidNumLetSet, status);
2410 fSets->addElement(fMidNumSet, status);
2411 fSets->addElement(fNumericSet, status);
2412 fSets->addElement(fFormatSet, status);
2413 fSets->addElement(fExtendSet, status);
2414 fSets->addElement(fOtherSet, status);
2415 fSets->addElement(fExtendNumLetSet, status);
2416
2417 fSets->addElement(fEBaseSet, status);
2418 fSets->addElement(fEModifierSet, status);
2419 fSets->addElement(fZWSSet, status);
2420 fSets->addElement(fGAZSet, status);
2421
2422 if (U_FAILURE(status)) {
2423 deferredStatus = status;
2424 }
2425 }
2426
2427 void RBBIWordMonkey::setText(const UnicodeString &s) {
2428 fText = &s;
2429 }
2430
2431
2432 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2433 int p0, p1, p2, p3; // Indices of the significant code points around the
2434 // break position being tested. The candidate break
2435 // location is before p2.
2436
2437 int breakPos = -1;
2438
2439 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2440
2441 if (U_FAILURE(deferredStatus)) {
2442 return -1;
2443 }
2444
2445 // Prev break at end of string. return DONE.
2446 if (prevPos >= fText->length()) {
2447 return -1;
2448 }
2449 p0 = p1 = p2 = p3 = prevPos;
2450 c3 = fText->char32At(prevPos);
2451 c0 = c1 = c2 = 0;
2452 (void)p0; // Suppress set but not used warning.
2453
2454 // Loop runs once per "significant" character position in the input text.
2455 for (;;) {
2456 // Move all of the positions forward in the input string.
2457 p0 = p1; c0 = c1;
2458 p1 = p2; c1 = c2;
2459 p2 = p3; c2 = c3;
2460
2461 // Advancd p3 by X(Extend | Format)* Rule 4
2462 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2463 do {
2464 p3 = fText->moveIndex32(p3, 1);
2465 c3 = fText->char32At(p3);
2466 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2467 break;
2468 };
2469 }
2470 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
2471
2472
2473 if (p1 == p2) {
2474 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2475 continue;
2476 }
2477 if (p2 == fText->length()) {
2478 // Reached end of string. Always a break position.
2479 break;
2480 }
2481
2482 // Rule (3) CR x LF
2483 // No Extend or Format characters may appear between the CR and LF,
2484 // which requires the additional check for p2 immediately following p1.
2485 //
2486 if (c1==0x0D && c2==0x0A) {
2487 continue;
2488 }
2489
2490 // Rule (3a) Break before and after newlines (including CR and LF)
2491 //
2492 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2493 break;
2494 };
2495 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2496 break;
2497 };
2498
2499 // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
2500 // Not ignoring extend chars, so peek into input text to
2501 // get the potential ZWJ, the character immediately preceding c2.
2502 // Sloppy UChar32 indexing: p2-1 may reference trail half
2503 // but char32At will get the full code point.
2504 if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
2505 continue;
2506 }
2507
2508 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2509 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2510 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2511 continue;
2512 }
2513
2514 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2515 //
2516 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2517 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2518 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2519 continue;
2520 }
2521
2522 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2523 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2524 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2525 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2526 continue;
2527 }
2528
2529 // Rule (7a) Hebrew_Letter x Single_Quote
2530 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2531 continue;
2532 }
2533
2534 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2535 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2536 continue;
2537 }
2538
2539 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2540 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2541 continue;
2542 }
2543
2544 // Rule (8) Numeric x Numeric
2545 if (fNumericSet->contains(c1) &&
2546 fNumericSet->contains(c2)) {
2547 continue;
2548 }
2549
2550 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2551 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2552 fNumericSet->contains(c2)) {
2553 continue;
2554 }
2555
2556 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2557 if (fNumericSet->contains(c1) &&
2558 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2559 continue;
2560 }
2561
2562 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2563 if (fNumericSet->contains(c0) &&
2564 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2565 fNumericSet->contains(c2)) {
2566 continue;
2567 }
2568
2569 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2570 if (fNumericSet->contains(c1) &&
2571 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2572 fNumericSet->contains(c3)) {
2573 continue;
2574 }
2575
2576 // Rule (13) Katakana x Katakana
2577 if (fKatakanaSet->contains(c1) &&
2578 fKatakanaSet->contains(c2)) {
2579 continue;
2580 }
2581
2582 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2583 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2584 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2585 fExtendNumLetSet->contains(c2)) {
2586 continue;
2587 }
2588
2589 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2590 if (fExtendNumLetSet->contains(c1) &&
2591 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2592 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2593 continue;
2594 }
2595
2596 // Rule 13c
2597 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2598 break;
2599 }
2600 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2601 continue;
2602 }
2603
2604 // Rule 13d
2605 if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
2606 continue;
2607 }
2608
2609 // Rule 14. Break found here.
2610 break;
2611 }
2612
2613 breakPos = p2;
2614 return breakPos;
2615 }
2616
2617
2618 UVector *RBBIWordMonkey::charClasses() {
2619 return fSets;
2620 }
2621
2622
2623 RBBIWordMonkey::~RBBIWordMonkey() {
2624 delete fSets;
2625 delete fCRSet;
2626 delete fLFSet;
2627 delete fNewlineSet;
2628 delete fKatakanaSet;
2629 delete fHebrew_LetterSet;
2630 delete fALetterSet;
2631 delete fSingle_QuoteSet;
2632 delete fDouble_QuoteSet;
2633 delete fMidNumLetSet;
2634 delete fMidLetterSet;
2635 delete fMidNumSet;
2636 delete fNumericSet;
2637 delete fFormatSet;
2638 delete fExtendSet;
2639 delete fExtendNumLetSet;
2640 delete fRegionalIndicatorSet;
2641 delete fDictionaryCjkSet;
2642 delete fOtherSet;
2643 delete fEBaseSet;
2644 delete fEModifierSet;
2645 delete fZWSSet;
2646 delete fGAZSet;
2647 }
2648
2649
2650
2651
2652 //------------------------------------------------------------------------------------------
2653 //
2654 // class RBBISentMonkey Sentence Break specific implementation
2655 // of RBBIMonkeyKind.
2656 //
2657 //------------------------------------------------------------------------------------------
2658 class RBBISentMonkey: public RBBIMonkeyKind {
2659 public:
2660 RBBISentMonkey();
2661 virtual ~RBBISentMonkey();
2662 virtual UVector *charClasses();
2663 virtual void setText(const UnicodeString &s);
2664 virtual int32_t next(int32_t i);
2665 private:
2666 int moveBack(int posFrom);
2667 int moveForward(int posFrom);
2668 UChar32 cAt(int pos);
2669
2670 UVector *fSets;
2671
2672 UnicodeSet *fSepSet;
2673 UnicodeSet *fFormatSet;
2674 UnicodeSet *fSpSet;
2675 UnicodeSet *fLowerSet;
2676 UnicodeSet *fUpperSet;
2677 UnicodeSet *fOLetterSet;
2678 UnicodeSet *fNumericSet;
2679 UnicodeSet *fATermSet;
2680 UnicodeSet *fSContinueSet;
2681 UnicodeSet *fSTermSet;
2682 UnicodeSet *fCloseSet;
2683 UnicodeSet *fOtherSet;
2684 UnicodeSet *fExtendSet;
2685
2686 const UnicodeString *fText;
2687
2688 };
2689
2690 RBBISentMonkey::RBBISentMonkey()
2691 {
2692 UErrorCode status = U_ZERO_ERROR;
2693
2694 fSets = new UVector(status);
2695
2696 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2697 // set and made into character classes of their own. For the monkey impl,
2698 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2699 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2700 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2701 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2702 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2703 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2704 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2705 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2706 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2707 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2708 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2709 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2710 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2711 fOtherSet = new UnicodeSet();
2712
2713 if(U_FAILURE(status)) {
2714 deferredStatus = status;
2715 return;
2716 }
2717
2718 fOtherSet->complement();
2719 fOtherSet->removeAll(*fSepSet);
2720 fOtherSet->removeAll(*fFormatSet);
2721 fOtherSet->removeAll(*fSpSet);
2722 fOtherSet->removeAll(*fLowerSet);
2723 fOtherSet->removeAll(*fUpperSet);
2724 fOtherSet->removeAll(*fOLetterSet);
2725 fOtherSet->removeAll(*fNumericSet);
2726 fOtherSet->removeAll(*fATermSet);
2727 fOtherSet->removeAll(*fSContinueSet);
2728 fOtherSet->removeAll(*fSTermSet);
2729 fOtherSet->removeAll(*fCloseSet);
2730 fOtherSet->removeAll(*fExtendSet);
2731
2732 fSets->addElement(fSepSet, status);
2733 fSets->addElement(fFormatSet, status);
2734 fSets->addElement(fSpSet, status);
2735 fSets->addElement(fLowerSet, status);
2736 fSets->addElement(fUpperSet, status);
2737 fSets->addElement(fOLetterSet, status);
2738 fSets->addElement(fNumericSet, status);
2739 fSets->addElement(fATermSet, status);
2740 fSets->addElement(fSContinueSet, status);
2741 fSets->addElement(fSTermSet, status);
2742 fSets->addElement(fCloseSet, status);
2743 fSets->addElement(fOtherSet, status);
2744 fSets->addElement(fExtendSet, status);
2745
2746 if (U_FAILURE(status)) {
2747 deferredStatus = status;
2748 }
2749 }
2750
2751
2752
2753 void RBBISentMonkey::setText(const UnicodeString &s) {
2754 fText = &s;
2755 }
2756
2757 UVector *RBBISentMonkey::charClasses() {
2758 return fSets;
2759 }
2760
2761
2762 // moveBack() Find the "significant" code point preceding the index i.
2763 // Skips over ($Extend | $Format)* .
2764 //
2765 int RBBISentMonkey::moveBack(int i) {
2766 if (i <= 0) {
2767 return -1;
2768 }
2769 UChar32 c;
2770 int32_t j = i;
2771 do {
2772 j = fText->moveIndex32(j, -1);
2773 c = fText->char32At(j);
2774 }
2775 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2776 return j;
2777
2778 }
2779
2780
2781 int RBBISentMonkey::moveForward(int i) {
2782 if (i>=fText->length()) {
2783 return fText->length();
2784 }
2785 UChar32 c;
2786 int32_t j = i;
2787 do {
2788 j = fText->moveIndex32(j, 1);
2789 c = cAt(j);
2790 }
2791 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2792 return j;
2793 }
2794
2795 UChar32 RBBISentMonkey::cAt(int pos) {
2796 if (pos<0 || pos>=fText->length()) {
2797 return -1;
2798 } else {
2799 return fText->char32At(pos);
2800 }
2801 }
2802
// Reference ("monkey") implementation of sentence-break detection.
// Scans forward from prevPos, sliding a four-position window (p0..p3 / c0..c3) over the
// significant characters of fText and applying the sentence break rules in order.
// Inside the loop, `continue` means "no break before p2, advance", `break` means
// "break found before p2".
// Returns the index of the break position, or -1 when deferredStatus holds a failure
// or prevPos is already at or beyond the end of the text.
2803 int32_t RBBISentMonkey::next(int32_t prevPos) {
2804 int p0, p1, p2, p3; // Indices of the significant code points around the
2805 // break position being tested. The candidate break
2806 // location is before p2.
2807
2808 int breakPos = -1;
2809
2810 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2811 UChar32 c;
2812
2813 if (U_FAILURE(deferredStatus)) {
2814 return -1;
2815 }
2816
2817 // Prev break at end of string. return DONE.
2818 if (prevPos >= fText->length()) {
2819 return -1;
2820 }
2821 p0 = p1 = p2 = p3 = prevPos;
2822 c3 = fText->char32At(prevPos);
2823 c0 = c1 = c2 = 0;
2824 (void)p0; // Suppress set but not used warning.
2825
2826 // Loop runs once per "significant" character position in the input text.
2827 for (;;) {
2828 // Move all of the positions forward in the input string.
2829 p0 = p1; c0 = c1;
2830 p1 = p2; c1 = c2;
2831 p2 = p3; c2 = c3;
2832
2833 // Advance p3 by X(Extend | Format)* Rule 4
2834 p3 = moveForward(p3);
2835 c3 = cAt(p3);
2836
2837 // Rule (3) CR x LF
2838 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2839 continue;
2840 }
2841
2842 // Rule (4). Sep <break>
2843 if (fSepSet->contains(c1)) {
2844 p2 = p1+1; // Separators don't combine with Extend or Format.
2845 break;
2846 }
2847
2848 if (p2 >= fText->length()) {
2849 // Reached end of string. Always a break position.
2850 break;
2851 }
2852
2853 if (p2 == prevPos) {
2854 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2855 continue;
2856 }
2857
2858 // Rule (6). ATerm x Numeric
2859 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2860 continue;
2861 }
2862
2863 // Rule (7). (Upper | Lower) ATerm x Upper
2864 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2865 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2866 continue;
2867 }
2868
2869 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2870 // Note: STerm | ATerm are added to the negated part of the expression by a
2871 // note to the Unicode 5.0 documents.
// First scan backwards from p1 over Sp* then Close* looking for the ATerm.
2872 int p8 = p1;
2873 while (fSpSet->contains(cAt(p8))) {
2874 p8 = moveBack(p8);
2875 }
2876 while (fCloseSet->contains(cAt(p8))) {
2877 p8 = moveBack(p8);
2878 }
2879 if (fATermSet->contains(cAt(p8))) {
2880 p8=p2; // ATerm found; now scan forward from the candidate break position.
2881 for (;;) {
2882 c = cAt(p8);
2883 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2884 fLowerSet->contains(c) || fSepSet->contains(c) ||
2885 fATermSet->contains(c) || fSTermSet->contains(c)) {
2886 break;
2887 }
2888 p8 = moveForward(p8);
2889 }
2890 if (fLowerSet->contains(cAt(p8))) {
2891 continue;
2892 }
2893 }
2894
2895 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2896 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2897 p8 = p1;
2898 while (fSpSet->contains(cAt(p8))) {
2899 p8 = moveBack(p8);
2900 }
2901 while (fCloseSet->contains(cAt(p8))) {
2902 p8 = moveBack(p8);
2903 }
2904 c = cAt(p8);
2905 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2906 continue;
2907 }
2908 }
2909
2910 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2911 int p9 = p1;
2912 while (fCloseSet->contains(cAt(p9))) {
2913 p9 = moveBack(p9);
2914 }
2915 c = cAt(p9);
2916 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2917 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2918 continue;
2919 }
2920 }
2921
2922 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2923 int p10 = p1;
2924 while (fSpSet->contains(cAt(p10))) {
2925 p10 = moveBack(p10);
2926 }
2927 while (fCloseSet->contains(cAt(p10))) {
2928 p10 = moveBack(p10);
2929 }
2930 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2931 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2932 continue;
2933 }
2934 }
2935
2936 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
// Scan backwards from p1 over an optional Sep, then Sp*, then Close*.
2937 int p11 = p1;
2938 if (fSepSet->contains(cAt(p11))) {
2939 p11 = moveBack(p11);
2940 }
2941 while (fSpSet->contains(cAt(p11))) {
2942 p11 = moveBack(p11);
2943 }
2944 while (fCloseSet->contains(cAt(p11))) {
2945 p11 = moveBack(p11);
2946 }
2947 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2948 break;
2949 }
2950
2951 // Rule (12) Any x Any
2952 continue;
2953 }
// The candidate break location is before p2 (see the declaration of p0..p3).
2954 breakPos = p2;
2955 return breakPos;
2956 }
2957
2958 RBBISentMonkey::~RBBISentMonkey() {
2959 delete fSets;
2960 delete fSepSet;
2961 delete fFormatSet;
2962 delete fSpSet;
2963 delete fLowerSet;
2964 delete fUpperSet;
2965 delete fOLetterSet;
2966 delete fNumericSet;
2967 delete fATermSet;
2968 delete fSContinueSet;
2969 delete fSTermSet;
2970 delete fCloseSet;
2971 delete fOtherSet;
2972 delete fExtendSet;
2973 }
2974
2975
2976
2977 //-------------------------------------------------------------------------------------------
2978 //
2979 // RBBILineMonkey
2980 //
2981 //-------------------------------------------------------------------------------------------
2982
2983 class RBBILineMonkey: public RBBIMonkeyKind {
2984 public:
2985 RBBILineMonkey();
2986 virtual ~RBBILineMonkey();
2987 virtual UVector *charClasses();
2988 virtual void setText(const UnicodeString &s);
2989 virtual int32_t next(int32_t i);
2990 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2991 private:
2992 UVector *fSets;
2993
2994 UnicodeSet *fBK;
2995 UnicodeSet *fCR;
2996 UnicodeSet *fLF;
2997 UnicodeSet *fCM;
2998 UnicodeSet *fNL;
2999 UnicodeSet *fSG;
3000 UnicodeSet *fWJ;
3001 UnicodeSet *fZW;
3002 UnicodeSet *fGL;
3003 UnicodeSet *fCB;
3004 UnicodeSet *fSP;
3005 UnicodeSet *fB2;
3006 UnicodeSet *fBA;
3007 UnicodeSet *fBB;
3008 UnicodeSet *fHY;
3009 UnicodeSet *fH2;
3010 UnicodeSet *fH3;
3011 UnicodeSet *fCL;
3012 UnicodeSet *fCP;
3013 UnicodeSet *fEX;
3014 UnicodeSet *fIN;
3015 UnicodeSet *fJL;
3016 UnicodeSet *fJV;
3017 UnicodeSet *fJT;
3018 UnicodeSet *fNS;
3019 UnicodeSet *fOP;
3020 UnicodeSet *fQU;
3021 UnicodeSet *fIS;
3022 UnicodeSet *fNU;
3023 UnicodeSet *fPO;
3024 UnicodeSet *fPR;
3025 UnicodeSet *fSY;
3026 UnicodeSet *fAI;
3027 UnicodeSet *fAL;
3028 UnicodeSet *fCJ;
3029 UnicodeSet *fHL;
3030 UnicodeSet *fID;
3031 UnicodeSet *fRI;
3032 UnicodeSet *fXX;
3033 UnicodeSet *fEB;
3034 UnicodeSet *fEM;
3035 UnicodeSet *fZJ;
3036
3037 BreakIterator *fCharBI;
3038 const UnicodeString *fText;
3039 RegexMatcher *fNumberMatcher;
3040 };
3041
3042 RBBILineMonkey::RBBILineMonkey() :
3043 RBBIMonkeyKind(),
3044 fSets(NULL),
3045
3046 fCharBI(NULL),
3047 fText(NULL),
3048 fNumberMatcher(NULL)
3049
3050 {
3051 if (U_FAILURE(deferredStatus)) {
3052 return;
3053 }
3054
3055 UErrorCode status = U_ZERO_ERROR;
3056
3057 fSets = new UVector(status);
3058
3059 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3060 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3061 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3062 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3063 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3064 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3065 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3066 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3067 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3068 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3069 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3070 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3071 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3072 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3073 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3074 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3075 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3076 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3077 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3078 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3079 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3080 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3081 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3082 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3083 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3084 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3085 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3086 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3087 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3088 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3089 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3090 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3091 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3092 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3093 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3094 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3095 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3096 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3097 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3098 fEB = new UnicodeSet(UnicodeString(
3099 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
3100 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
3101 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
3102 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
3103 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
3104 fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
3105
3106 if (U_FAILURE(status)) {
3107 deferredStatus = status;
3108 return;
3109 }
3110
3111 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3112 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3113 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3114
3115 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3116
3117 fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
3118 fID->addAll(*fEM);
3119 fAL->removeAll(*fEM);
3120
3121
3122 fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
3123 fAI->remove((UChar32)0x2640); // new ZWJ seqs
3124 fAI->remove((UChar32)0x2642); // new ZWJ seqs
3125 fID->add((UChar32)0x2764);
3126 fID->add((UChar32)0x2640);
3127 fID->add((UChar32)0x2642);
3128
3129 fSets->addElement(fBK, status);
3130 fSets->addElement(fCR, status);
3131 fSets->addElement(fLF, status);
3132 fSets->addElement(fCM, status);
3133 fSets->addElement(fNL, status);
3134 fSets->addElement(fWJ, status);
3135 fSets->addElement(fZW, status);
3136 fSets->addElement(fGL, status);
3137 fSets->addElement(fCB, status);
3138 fSets->addElement(fSP, status);
3139 fSets->addElement(fB2, status);
3140 fSets->addElement(fBA, status);
3141 fSets->addElement(fBB, status);
3142 fSets->addElement(fHY, status);
3143 fSets->addElement(fH2, status);
3144 fSets->addElement(fH3, status);
3145 fSets->addElement(fCL, status);
3146 fSets->addElement(fCP, status);
3147 fSets->addElement(fEX, status);
3148 fSets->addElement(fIN, status);
3149 fSets->addElement(fJL, status);
3150 fSets->addElement(fJT, status);
3151 fSets->addElement(fJV, status);
3152 fSets->addElement(fNS, status);
3153 fSets->addElement(fOP, status);
3154 fSets->addElement(fQU, status);
3155 fSets->addElement(fIS, status);
3156 fSets->addElement(fNU, status);
3157 fSets->addElement(fPO, status);
3158 fSets->addElement(fPR, status);
3159 fSets->addElement(fSY, status);
3160 fSets->addElement(fAI, status);
3161 fSets->addElement(fAL, status);
3162 fSets->addElement(fHL, status);
3163 fSets->addElement(fID, status);
3164 fSets->addElement(fWJ, status);
3165 fSets->addElement(fRI, status);
3166 fSets->addElement(fSG, status);
3167 fSets->addElement(fEB, status);
3168 fSets->addElement(fEM, status);
3169 fSets->addElement(fZJ, status);
3170
3171 const char *rules =
3172 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3173 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3174 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3175 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3176 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3177 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3178
3179 fNumberMatcher = new RegexMatcher(
3180 UnicodeString(rules, -1, US_INV), 0, status);
3181
3182 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3183
3184 if (U_FAILURE(status)) {
3185 deferredStatus = status;
3186 }
3187 }
3188
3189
3190 void RBBILineMonkey::setText(const UnicodeString &s) {
3191 fText = &s;
3192 fCharBI->setText(s);
3193 fNumberMatcher->reset(s);
3194 }
3195
3196 //
3197 // rule9Adjust
3198 // Line Break TR rules 9 and 10 implementation.
3199 // This deals with combining marks and other sequences that
3200 // must be treated as if they were something other than what they actually are.
3201 //
3202 // This is factored out into a separate function because it must be applied twice for
3203 // each potential break, once to the chars before the position being checked, then
3204 // again to the text following the possible break.
3205 //
3206 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3207 if (pos == -1) {
3208 // Invalid initial position. Happens during the warmup iteration of the
3209 // main loop in next().
3210 return;
3211 }
3212
3213 int32_t nPos = *nextPos;
3214
3215 // LB 9 Keep combining sequences together.
3216 // advance over any CM class chars. Note that Line Break CM is different
3217 // from the normal Grapheme Extend property.
3218 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3219 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3220 for (;;) {
3221 *nextChar = fText->char32At(nPos);
3222 if (!fCM->contains(*nextChar)) {
3223 break;
3224 }
3225 nPos = fText->moveIndex32(nPos, 1);
3226 }
3227 }
3228
3229
3230 // LB 9 Treat X CM* as if it were x.
3231 // No explicit action required.
3232
3233 // LB 10 Treat any remaining combining mark as AL
3234 if (fCM->contains(*posChar)) {
3235 *posChar = 0x41; // thisChar = 'A';
3236 }
3237
3238 // Push the updated nextPos and nextChar back to our caller.
3239 // This only makes a difference if posChar got bigger by consuming a
3240 // combining sequence.
3241 *nextPos = nPos;
3242 *nextChar = fText->char32At(nPos);
3243 }
3244
3245
3246
// Reference ("monkey") implementation of line-break detection.
// Starting at startPos, steps forward through fText one candidate position at a time and
// applies the LB rules in order.  Inside the loop, `continue` means "no break at this
// position, try the next one" and `break` means "a break was found before pos".
// Returns the index of the character following the break (pos), or -1 when deferredStatus
// holds a failure or startPos is at or beyond the end of the text.
3247 int32_t RBBILineMonkey::next(int32_t startPos) {
3248 UErrorCode status = U_ZERO_ERROR;
3249 int32_t pos; // Index of the char following a potential break position
3250 UChar32 thisChar; // Character at above position "pos"
3251
3252 int32_t prevPos; // Index of the char preceding a potential break position
3253 UChar32 prevChar; // Character at above position. Note that prevChar
3254 // and thisChar may not be adjacent because combining
3255 // characters between them will be ignored.
3256
3257 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3258 UChar32 prevCharX2;
3259
3260 int32_t nextPos; // Index of the next character following pos.
3261 // Usually skips over combining marks.
3262 int32_t nextCPPos; // Index of the code point following "pos."
3263 // May point to a combining mark.
3264 int32_t tPos; // temp value.
3265 UChar32 c;
3266
3267 if (U_FAILURE(deferredStatus)) {
3268 return -1;
3269 }
3270
3271 if (startPos >= fText->length()) {
3272 return -1;
3273 }
3274
3275
3276 // Initial values for loop. Loop will run the first time without finding breaks,
3277 // while the invalid values shift out and the "this" and
3278 // "prev" positions are filled in with good values.
3279 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3280 thisChar = prevChar = prevCharX2 = 0;
3281 nextPos = nextCPPos = startPos;
3282
3283
3284 // Loop runs once per position in the test text, until a break position
3285 // is found.
3286 for (;;) {
3287 prevPosX2 = prevPos;
3288 prevCharX2 = prevChar;
3289
3290 prevPos = pos;
3291 prevChar = thisChar;
3292
3293 pos = nextPos;
3294 thisChar = fText->char32At(pos);
3295
3296 nextCPPos = fText->moveIndex32(pos, 1);
3297 nextPos = nextCPPos;
3298
3299 // Rule LB2 - Break at end of text.
3300 if (pos >= fText->length()) {
3301 break;
3302 }
3303
3304 // Rule LB 9 - adjust for combining sequences.
3305 // We do this one out-of-order because the adjustment does not change anything
3306 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3307 // be applied.
3308 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3309 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3310 c = fText->char32At(nextPos);
3311 rule9Adjust(pos, &thisChar, &nextPos, &c);
3312
3313 // If the loop is still warming up - if we haven't shifted the initial
3314 // -1 positions out of prevPos yet - loop back to advance the
3315 // position in the input without any further looking for breaks.
3316 if (prevPos == -1) {
3317 continue;
3318 }
3319
3320 // LB 4 Always break after hard line breaks,
3321 if (fBK->contains(prevChar)) {
3322 break;
3323 }
3324
3325 // LB 5 Break after CR, LF, NL, but not inside CR LF
3326 if (prevChar == 0x0d && thisChar == 0x0a) {
3327 continue;
3328 }
3329 if (prevChar == 0x0d ||
3330 prevChar == 0x0a ||
3331 prevChar == 0x85) {
3332 break;
3333 }
3334
3335 // LB 6 Don't break before hard line breaks
3336 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3337 fBK->contains(thisChar)) {
3338 continue;
3339 }
3340
3341
3342 // LB 7 Don't break before spaces or zero-width space.
3343 if (fSP->contains(thisChar)) {
3344 continue;
3345 }
3346
3347 if (fZW->contains(thisChar)) {
3348 continue;
3349 }
3350
3351 // LB 8 Break after zero width space
3352 if (fZW->contains(prevChar)) {
3353 break;
3354 }
3355
3356 // LB 8a ZJ x ID
3357 // The monkey test's way of ignoring combining characters doesn't work
3358 // for this rule. ZJ is also a CM. Need to get the actual character
3359 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3360 {
3361 int32_t prevIdx = fText->moveIndex32(pos, -1);
3362 UChar32 prevC = fText->char32At(prevIdx);
3363 if (fZJ->contains(prevC) && fID->contains(thisChar)) {
3364 continue;
3365 }
3366 }
3367
3368 // LB 9, 10 Already done, at top of loop.
3369 //
3370
3371
3372 // LB 11 Do not break before or after WORD JOINER and related characters.
3373 // x WJ
3374 // WJ x
3375 //
3376 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3377 continue;
3378 }
3379
3380 // LB 12
3381 // GL x
3382 if (fGL->contains(prevChar)) {
3383 continue;
3384 }
3385
3386 // LB 12a
3387 // [^SP BA HY] x GL
3388 if (!(fSP->contains(prevChar) ||
3389 fBA->contains(prevChar) ||
3390 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3391 continue;
3392 }
3393
3394
3395
3396 // LB 13 Don't break before closings.
3397 // NU x CL, NU x CP, NU x IS and NU x SY are not matched here so that they will
3398 // fall into LB 25 and the more general number regular expression.
3399 //
3400 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3401 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3402 fEX->contains(thisChar) ||
3403 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3404 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3405 continue;
3406 }
3407
3408 // LB 14 Don't break after OP SP*
3409 // Scan backwards, checking for this sequence.
3410 // The OP char could include combining marks, so we actually check for
3411 // OP CM* SP*
3412 // Another Twist: The Rule 67 fixes may have changed a SP CM
3413 // sequence into a ID char, so before scanning back through spaces,
3414 // verify that prevChar is indeed a space. The prevChar variable
3415 // may differ from fText[prevPos]
3416 tPos = prevPos;
3417 if (fSP->contains(prevChar)) {
3418 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3419 tPos=fText->moveIndex32(tPos, -1);
3420 }
3421 }
3422 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3423 tPos=fText->moveIndex32(tPos, -1);
3424 }
3425 if (fOP->contains(fText->char32At(tPos))) {
3426 continue;
3427 }
3428
3429
3430 // LB 15 QU SP* x OP
3431 if (fOP->contains(thisChar)) {
3432 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3433 int tPos = prevPos; // NOTE: shadows the function-level tPos declared above.
3434 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3435 tPos = fText->moveIndex32(tPos, -1);
3436 }
3437 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3438 tPos = fText->moveIndex32(tPos, -1);
3439 }
3440 if (fQU->contains(fText->char32At(tPos))) {
3441 continue;
3442 }
3443 }
3444
3445
3446
3447 // LB 16 (CL | CP) SP* x NS
3448 // Scan backwards for SP* CM* (CL | CP)
3449 if (fNS->contains(thisChar)) {
3450 int tPos = prevPos; // NOTE: shadows the function-level tPos declared above.
3451 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3452 tPos = fText->moveIndex32(tPos, -1);
3453 }
3454 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3455 tPos = fText->moveIndex32(tPos, -1);
3456 }
3457 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3458 continue;
3459 }
3460 }
3461
3462
3463 // LB 17 B2 SP* x B2
3464 if (fB2->contains(thisChar)) {
3465 // Scan backwards, checking for the B2 CM* SP* sequence.
3466 tPos = prevPos;
3467 if (fSP->contains(prevChar)) {
3468 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3469 tPos=fText->moveIndex32(tPos, -1);
3470 }
3471 }
3472 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3473 tPos=fText->moveIndex32(tPos, -1);
3474 }
3475 if (fB2->contains(fText->char32At(tPos))) {
3476 continue;
3477 }
3478 }
3479
3480
3481 // LB 18 break after space
3482 if (fSP->contains(prevChar)) {
3483 break;
3484 }
3485
3486 // LB 19
3487 // x QU
3488 // QU x
3489 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3490 continue;
3491 }
3492
3493 // LB 20 Break around a CB
3494 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3495 break;
3496 }
3497
3498 // LB 21
3499 if (fBA->contains(thisChar) ||
3500 fHY->contains(thisChar) ||
3501 fNS->contains(thisChar) ||
3502 fBB->contains(prevChar) ) {
3503 continue;
3504 }
3505
3506 // LB 21a
3507 // HL (HY | BA) x
3508 if (fHL->contains(prevCharX2) &&
3509 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3510 continue;
3511 }
3512
3513 // LB 21b
3514 // SY x HL
3515 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3516 continue;
3517 }
3518
3519 // LB 22
3520 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3521 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3522 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3523 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3524 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3525 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3526 continue;
3527 }
3528
3529
3530 // LB 23 ID x PO
3531 // AL x NU
3532 // HL x NU
3533 // NU x AL, NU x HL
3534 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3535 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3536 (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3537 (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3538 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
3539 continue;
3540 }
3541
3542 // LB 24 Do not break between prefix and letters or ideographs.
3543 // PR x ID
3544 // PR x (AL | HL)
3545 // PO x (AL | HL)
3546 // (AL | HL) x PR // Apple early addition
3547 // (AL | HL) x PO // Apple early addition
3548 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3549 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3550 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3551 ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPR->contains(thisChar)) ||
3552 ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPO->contains(thisChar)) ) {
3553 continue;
3554 }
3555
3556
3557
3558 // LB 25 Numbers
3559 if (fNumberMatcher->lookingAt(prevPos, status)) {
3560 if (U_FAILURE(status)) {
3561 break;
3562 }
3563 // Matched a number. But could have been just a single digit, which would
3564 // not represent a "no break here" between prevChar and thisChar
3565 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3566 if (numEndIdx > pos) {
3567 // Number match includes at least our two chars being checked
3568 if (numEndIdx > nextPos) {
3569 // Number match includes additional chars. Update pos and nextPos
3570 // so that next loop iteration will continue at the end of the number,
3571 // checking for breaks between last char in number & whatever follows.
3572 pos = nextPos = numEndIdx;
3573 do {
3574 pos = fText->moveIndex32(pos, -1);
3575 thisChar = fText->char32At(pos);
3576 } while (fCM->contains(thisChar));
3577 }
3578 continue;
3579 }
3580 }
3581
3582
3583 // LB 26 Do not break a Korean syllable.
3584 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3585 fJV->contains(thisChar) ||
3586 fH2->contains(thisChar) ||
3587 fH3->contains(thisChar))) {
3588 continue;
3589 }
3590
3591 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3592 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3593 continue;
3594 }
3595
3596 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3597 fJT->contains(thisChar)) {
3598 continue;
3599 }
3600
3601 // LB 27 Treat a Korean Syllable Block the same as ID.
3602 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3603 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3604 fIN->contains(thisChar)) {
3605 continue;
3606 }
3607 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3608 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3609 fPO->contains(thisChar)) {
3610 continue;
3611 }
3612 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3613 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3614 continue;
3615 }
3616
3617
3618
3619 // LB 28 Do not break between alphabetics ("at").
3620 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3621 continue;
3622 }
3623
3624 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3625 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3626 continue;
3627 }
3628
3629 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3630 // (AL | HL | NU) x OP
3631 // CP x (AL | HL | NU)
3632 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3633 continue;
3634 }
3635 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3636 continue;
3637 }
3638
3639 // LB30a RI RI <break> RI
3640 // RI x RI
3641 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3642 break;
3643 }
3644 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3645 continue;
3646 }
3647
3648 // LB30b Emoji Base x Emoji Modifier
3649 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3650 continue;
3651 }
3652
3653 // LB 31 Break everywhere else
3654 break;
3655
3656 }
3657
3658 return pos;
3659 }
3660
3661
3662 UVector *RBBILineMonkey::charClasses() {
3663 return fSets;
3664 }
3665
3666
3667 RBBILineMonkey::~RBBILineMonkey() {
3668 delete fSets;
3669
3670 delete fBK;
3671 delete fCR;
3672 delete fLF;
3673 delete fCM;
3674 delete fNL;
3675 delete fWJ;
3676 delete fZW;
3677 delete fGL;
3678 delete fCB;
3679 delete fSP;
3680 delete fB2;
3681 delete fBA;
3682 delete fBB;
3683 delete fHY;
3684 delete fH2;
3685 delete fH3;
3686 delete fCL;
3687 delete fCP;
3688 delete fEX;
3689 delete fIN;
3690 delete fJL;
3691 delete fJV;
3692 delete fJT;
3693 delete fNS;
3694 delete fOP;
3695 delete fQU;
3696 delete fIS;
3697 delete fNU;
3698 delete fPO;
3699 delete fPR;
3700 delete fSY;
3701 delete fAI;
3702 delete fAL;
3703 delete fCJ;
3704 delete fHL;
3705 delete fID;
3706 delete fRI;
3707 delete fSG;
3708 delete fXX;
3709 delete fEB;
3710 delete fEM;
3711 delete fZJ;
3712
3713 delete fCharBI;
3714 delete fNumberMatcher;
3715 }
3716
3717
3718 //-------------------------------------------------------------------------------------------
3719 //
3720 // TestMonkey
3721 //
3722 // params
3723 // seed=nnnnn Random number starting seed.
3724 // Setting the seed allows errors to be reproduced.
3725 // loop=nnn Looping count. Controls running time.
3726 // -1: run forever.
3727 // 0 or greater: run length.
3728 //
3729 // type = char | word | line | sent | title
3730 //
3731 // Example:
3732 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3733 //
3734 //-------------------------------------------------------------------------------------------
3735
3736 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3737 int32_t val = defaultVal;
3738 name.append(" *= *(-?\\d+)");
3739 UErrorCode status = U_ZERO_ERROR;
3740 RegexMatcher m(name, params, 0, status);
3741 if (m.find()) {
3742 // The param exists. Convert the string to an int.
3743 char valString[100];
3744 int32_t paramLength = m.end(1, status) - m.start(1, status);
3745 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3746 paramLength = (int32_t)(sizeof(valString)-2);
3747 }
3748 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3749 val = strtol(valString, NULL, 10);
3750
3751 // Delete this parameter from the params string.
3752 m.reset();
3753 params = m.replaceFirst("", status);
3754 }
3755 U_ASSERT(U_SUCCESS(status));
3756 return val;
3757 }
3758 #endif
3759
3760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3761 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3762 BreakIterator *bi,
3763 int expected[],
3764 int expectedcount)
3765 {
3766 int count = 0;
3767 int i = 0;
3768 int forward[50];
3769 bi->setText(ustr);
3770 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3771 forward[count] = i;
3772 if (count < expectedcount && expected[count] != i) {
3773 test->errln("break forward test failed: expected %d but got %d",
3774 expected[count], i);
3775 break;
3776 }
3777 count ++;
3778 }
3779 if (count != expectedcount) {
3780 printStringBreaks(ustr, expected, expectedcount);
3781 test->errln("break forward test failed: missed %d match",
3782 expectedcount - count);
3783 return;
3784 }
3785 // testing boundaries
3786 for (i = 1; i < expectedcount; i ++) {
3787 int j = expected[i - 1];
3788 if (!bi->isBoundary(j)) {
3789 printStringBreaks(ustr, expected, expectedcount);
3790 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3791 return;
3792 }
3793 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3794 if (bi->isBoundary(j)) {
3795 printStringBreaks(ustr, expected, expectedcount);
3796 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3797 return;
3798 }
3799 }
3800 }
3801
3802 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3803 count --;
3804 if (forward[count] != i) {
3805 printStringBreaks(ustr, expected, expectedcount);
3806 test->errln("happy break test previous() failed: expected %d but got %d",
3807 forward[count], i);
3808 break;
3809 }
3810 }
3811 if (count != 0) {
3812 printStringBreaks(ustr, expected, expectedcount);
3813 test->errln("break test previous() failed: missed a match");
3814 return;
3815 }
3816
3817 // testing preceding
3818 for (i = 0; i < expectedcount - 1; i ++) {
3819 // int j = expected[i] + 1;
3820 int j = ustr.moveIndex32(expected[i], 1);
3821 for (; j <= expected[i + 1]; j ++) {
3822 if (bi->preceding(j) != expected[i]) {
3823 printStringBreaks(ustr, expected, expectedcount);
3824 test->errln("preceding(): Not expecting boundary at position %d", j);
3825 return;
3826 }
3827 }
3828 }
3829 }
3830 #endif
3831
3832 void RBBITest::TestWordBreaks(void)
3833 {
3834 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3835
3836 Locale locale("en");
3837 UErrorCode status = U_ZERO_ERROR;
3838 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3839 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3840 // Replaced any C+J characters in a row with a random sequence of characters
3841 // of the same length to make our C+J segmentation not get in the way.
3842 static const char *strlist[] =
3843 {
3844 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3845 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3846 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3847 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3848 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3849 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3850 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3851 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3852 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3853 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3854 "\\u2027\\U000e0067\\u0a47\\u00b7",
3855 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3856 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3857 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3858 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3859 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3860 "\\u0027\\u11af\\U000e0057\\u0602",
3861 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3862 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3863 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3864 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3865 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3866 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3867 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3868 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3869 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3870 "\\u18f4\\U000e0049\\u20e7\\u2027",
3871 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3872 "\\ua183\\u102d\\u0bec\\u003a",
3873 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3874 "\\u003a\\u0e57\\u0fad\\u002e",
3875 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3876 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3877 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3878 "\\u003a\\u0664\\u00b7\\u1fba",
3879 "\\u003b\\u0027\\u00b7\\u47a3",
3880 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3881 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3882 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3883 };
3884 int loop;
3885 if (U_FAILURE(status)) {
3886 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3887 return;
3888 }
3889 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3890 // printf("looping %d\n", loop);
3891 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3892 // RBBICharMonkey monkey;
3893 RBBIWordMonkey monkey;
3894
3895 int expected[50];
3896 int expectedcount = 0;
3897
3898 monkey.setText(ustr);
3899 int i;
3900 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3901 expected[expectedcount ++] = i;
3902 }
3903
3904 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3905 }
3906 delete bi;
3907 #endif
3908 }
3909
3910 void RBBITest::TestWordBoundary(void)
3911 {
3912 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3913 Locale locale("en");
3914 UErrorCode status = U_ZERO_ERROR;
3915 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3916 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3917 UChar str[50];
3918 static const char *strlist[] =
3919 {
3920 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3921 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3922 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3923 "\\u2027\\U000e0067\\u0a47\\u00b7",
3924 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3925 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3926 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3927 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3928 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3929 "\\u0027\\u11af\\U000e0057\\u0602",
3930 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3931 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3932 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3933 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3934 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3935 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3936 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3937 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3938 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3939 "\\u58f4\\U000e0049\\u20e7\\u2027",
3940 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3941 "\\ua183\\u102d\\u0bec\\u003a",
3942 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3943 "\\u003a\\u0e57\\u0fad\\u002e",
3944 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3945 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3946 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3947 "\\u003a\\u0664\\u00b7\\u1fba",
3948 "\\u003b\\u0027\\u00b7\\u47a3",
3949 };
3950 int loop;
3951 if (U_FAILURE(status)) {
3952 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3953 return;
3954 }
3955 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3956 // printf("looping %d\n", loop);
3957 u_unescape(strlist[loop], str, 20);
3958 UnicodeString ustr(str);
3959 int forward[50];
3960 int count = 0;
3961
3962 bi->setText(ustr);
3963 int prev = 0;
3964 int i;
3965 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3966 forward[count ++] = i;
3967 if (i > prev) {
3968 int j;
3969 for (j = prev + 1; j < i; j ++) {
3970 if (bi->isBoundary(j)) {
3971 printStringBreaks(ustr, forward, count);
3972 errln("happy boundary test failed: expected %d not a boundary",
3973 j);
3974 return;
3975 }
3976 }
3977 }
3978 if (!bi->isBoundary(i)) {
3979 printStringBreaks(ustr, forward, count);
3980 errln("happy boundary test failed: expected %d a boundary",
3981 i);
3982 return;
3983 }
3984 prev = i;
3985 }
3986 }
3987 delete bi;
3988 }
3989
3990 void RBBITest::TestLineBreaks(void)
3991 {
3992 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3993 Locale locale("en");
3994 UErrorCode status = U_ZERO_ERROR;
3995 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3996 const int32_t STRSIZE = 50;
3997 UChar str[STRSIZE];
3998 static const char *strlist[] =
3999 {
4000 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4001 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4002 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4003 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4004 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4005 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4006 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4007 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4008 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4009 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4010 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4011 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4012 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4013 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4014 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4015 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4016 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4017 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4018 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4019 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4020 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4021 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4022 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4023 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4024 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4025 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4026 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4027 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4028 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4029 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4030 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4031 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4032 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4033 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4034 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4035 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4036 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4037 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4038 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4039 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4040 };
4041 int loop;
4042 TEST_ASSERT_SUCCESS(status);
4043 if (U_FAILURE(status)) {
4044 return;
4045 }
4046 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4047 // printf("looping %d\n", loop);
4048 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4049 if (t >= STRSIZE) {
4050 TEST_ASSERT(FALSE);
4051 continue;
4052 }
4053
4054
4055 UnicodeString ustr(str);
4056 RBBILineMonkey monkey;
4057 if (U_FAILURE(monkey.deferredStatus)) {
4058 continue;
4059 }
4060
4061 const int EXPECTEDSIZE = 50;
4062 int expected[EXPECTEDSIZE];
4063 int expectedcount = 0;
4064
4065 monkey.setText(ustr);
4066 int i;
4067 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4068 if (expectedcount >= EXPECTEDSIZE) {
4069 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4070 return;
4071 }
4072 expected[expectedcount ++] = i;
4073 }
4074
4075 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4076 }
4077 delete bi;
4078 #endif
4079 }
4080
4081 void RBBITest::TestSentBreaks(void)
4082 {
4083 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4084 Locale locale("en");
4085 UErrorCode status = U_ZERO_ERROR;
4086 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4087 UChar str[200];
4088 static const char *strlist[] =
4089 {
4090 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4091 "This\n",
4092 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4093 "\"Sentence ending with a quote.\" Bye.",
4094 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4095 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4096 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4097 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4098 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4099 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4100 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4101 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4102 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4103 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4104 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4105 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4106 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4107 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4108 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4109 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4110 };
4111 int loop;
4112 if (U_FAILURE(status)) {
4113 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4114 return;
4115 }
4116 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4117 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4118 UnicodeString ustr(str);
4119
4120 RBBISentMonkey monkey;
4121 if (U_FAILURE(monkey.deferredStatus)) {
4122 continue;
4123 }
4124
4125 const int EXPECTEDSIZE = 50;
4126 int expected[EXPECTEDSIZE];
4127 int expectedcount = 0;
4128
4129 monkey.setText(ustr);
4130 int i;
4131 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4132 if (expectedcount >= EXPECTEDSIZE) {
4133 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4134 return;
4135 }
4136 expected[expectedcount ++] = i;
4137 }
4138
4139 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4140 }
4141 delete bi;
4142 #endif
4143 }
4144
4145 void RBBITest::TestMonkey(char *params) {
4146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4147
4148 UErrorCode status = U_ZERO_ERROR;
4149 int32_t loopCount = 500;
4150 int32_t seed = 1;
4151 UnicodeString breakType = "all";
4152 Locale locale("en");
4153 UBool useUText = FALSE;
4154
4155 if (quick == FALSE) {
4156 loopCount = 10000;
4157 }
4158
4159 if (params) {
4160 UnicodeString p(params);
4161 loopCount = getIntParam("loop", p, loopCount);
4162 seed = getIntParam("seed", p, seed);
4163
4164 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4165 if (m.find()) {
4166 breakType = m.group(1, status);
4167 m.reset();
4168 p = m.replaceFirst("", status);
4169 }
4170
4171 RegexMatcher u(" *utext", p, 0, status);
4172 if (u.find()) {
4173 useUText = TRUE;
4174 u.reset();
4175 p = u.replaceFirst("", status);
4176 }
4177
4178
4179 // m.reset(p);
4180 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4181 // Each option is stripped out of the option string as it is processed.
4182 // All options have been checked. The option string should have been completely emptied..
4183 char buf[100];
4184 p.extract(buf, sizeof(buf), NULL, status);
4185 buf[sizeof(buf)-1] = 0;
4186 errln("Unrecognized or extra parameter: %s\n", buf);
4187 return;
4188 }
4189
4190 }
4191
4192 if (breakType == "char" || breakType == "all") {
4193 RBBICharMonkey m;
4194 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4195 if (U_SUCCESS(status)) {
4196 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4197 if (breakType == "all" && useUText==FALSE) {
4198 // Also run a quick test with UText when "all" is specified
4199 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4200 }
4201 }
4202 else {
4203 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4204 }
4205 delete bi;
4206 }
4207
4208 if (breakType == "word" || breakType == "all") {
4209 logln("Word Break Monkey Test");
4210 RBBIWordMonkey m;
4211 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4212 if (U_SUCCESS(status)) {
4213 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4214 }
4215 else {
4216 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4217 }
4218 delete bi;
4219 }
4220
4221 if (breakType == "line" || breakType == "all") {
4222 logln("Line Break Monkey Test");
4223 RBBILineMonkey m;
4224 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4225 if (loopCount >= 10) {
4226 loopCount = loopCount / 5; // Line break runs slower than the others.
4227 }
4228 if (U_SUCCESS(status)) {
4229 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4230 }
4231 else {
4232 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4233 }
4234 delete bi;
4235 }
4236
4237 if (breakType == "sent" || breakType == "all" ) {
4238 logln("Sentence Break Monkey Test");
4239 RBBISentMonkey m;
4240 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4241 if (loopCount >= 10) {
4242 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4243 }
4244 if (U_SUCCESS(status)) {
4245 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4246 }
4247 else {
4248 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4249 }
4250 delete bi;
4251 }
4252
4253 #endif
4254 }
4255
4256 //
4257 // Run a RBBI monkey test. Common routine, for all break iterator types.
4258 // Parameters:
4259 // bi - the break iterator to use
4260 // mk - MonkeyKind, abstraction for obtaining expected results
4261 // name - Name of test (char, word, etc.) for use in error messages
4262 // seed - Seed for starting random number generator (parameter from user)
4263 // numIterations
4264 //
4265 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4266 int32_t numIterations, UBool useUText) {
4267
4268 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4269
4270 const int32_t TESTSTRINGLEN = 500;
4271 UnicodeString testText;
4272 int32_t numCharClasses;
4273 UVector *chClasses;
4274 int expected[TESTSTRINGLEN*2 + 1];
4275 int expectedCount = 0;
4276 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4277 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4278 char reverseBreaks[TESTSTRINGLEN*2+1];
4279 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4280 char followingBreaks[TESTSTRINGLEN*2+1];
4281 char precedingBreaks[TESTSTRINGLEN*2+1];
4282 int i;
4283 int loopCount = 0;
4284
4285 m_seed = seed;
4286
4287 numCharClasses = mk.charClasses()->size();
4288 chClasses = mk.charClasses();
4289
4290 // Check for errors that occured during the construction of the MonkeyKind object.
4291 // Can't report them where they occured because errln() is a method coming from intlTest,
4292 // and is not visible outside of RBBITest :-(
4293 if (U_FAILURE(mk.deferredStatus)) {
4294 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4295 return;
4296 }
4297
4298 // Verify that the character classes all have at least one member.
4299 for (i=0; i<numCharClasses; i++) {
4300 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4301 if (s == NULL || s->size() == 0) {
4302 errln("Character Class #%d is null or of zero size.", i);
4303 return;
4304 }
4305 }
4306
4307 while (loopCount < numIterations || numIterations == -1) {
4308 if (numIterations == -1 && loopCount % 10 == 0) {
4309 // If test is running in an infinite loop, display a periodic tic so
4310 // we can tell that it is making progress.
4311 fprintf(stderr, ".");
4312 }
4313 // Save current random number seed, so that we can recreate the random numbers
4314 // for this loop iteration in event of an error.
4315 seed = m_seed;
4316
4317 // Populate a test string with data.
4318 testText.truncate(0);
4319 for (i=0; i<TESTSTRINGLEN; i++) {
4320 int32_t aClassNum = m_rand() % numCharClasses;
4321 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4322 int32_t charIdx = m_rand() % classSet->size();
4323 UChar32 c = classSet->charAt(charIdx);
4324 if (c < 0) { // TODO: deal with sets containing strings.
4325 errln("%s:%d c < 0", __FILE__, __LINE__);
4326 break;
4327 }
4328 // Do not assemble a supplementary character from randomly generated separate surrogates.
4329 // (It could be a dictionary character)
4330 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4331 continue;
4332 }
4333
4334 testText.append(c);
4335 }
4336
4337 // Calculate the expected results for this test string.
4338 mk.setText(testText);
4339 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4340 expectedBreaks[0] = 1;
4341 int32_t breakPos = 0;
4342 expectedCount = 0;
4343 for (;;) {
4344 breakPos = mk.next(breakPos);
4345 if (breakPos == -1) {
4346 break;
4347 }
4348 if (breakPos > testText.length()) {
4349 errln("breakPos > testText.length()");
4350 }
4351 expectedBreaks[breakPos] = 1;
4352 U_ASSERT(expectedCount<testText.length());
4353 expected[expectedCount ++] = breakPos;
4354 (void)expected; // Set but not used warning.
4355 // TODO (andy): check it out.
4356 }
4357
4358 // Find the break positions using forward iteration
4359 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4360 if (useUText) {
4361 UErrorCode status = U_ZERO_ERROR;
4362 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4363 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4364 bi->setText(testUText, status);
4365 TEST_ASSERT_SUCCESS(status);
4366 utext_close(testUText); // The break iterator does a shallow clone of the UText
4367 // This UText can be closed immediately, so long as the
4368 // testText string continues to exist.
4369 } else {
4370 bi->setText(testText);
4371 }
4372
4373 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4374 if (i < 0 || i > testText.length()) {
4375 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4376 break;
4377 }
4378 forwardBreaks[i] = 1;
4379 }
4380
4381 // Find the break positions using reverse iteration
4382 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4383 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4384 if (i < 0 || i > testText.length()) {
4385 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4386 break;
4387 }
4388 reverseBreaks[i] = 1;
4389 }
4390
4391 // Find the break positions using isBoundary() tests.
4392 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4393 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4394 for (i=0; i<=testText.length(); i++) {
4395 isBoundaryBreaks[i] = bi->isBoundary(i);
4396 }
4397
4398
4399 // Find the break positions using the following() function.
4400 // printf(".");
4401 memset(followingBreaks, 0, sizeof(followingBreaks));
4402 int32_t lastBreakPos = 0;
4403 followingBreaks[0] = 1;
4404 for (i=0; i<testText.length(); i++) {
4405 breakPos = bi->following(i);
4406 if (breakPos <= i ||
4407 breakPos < lastBreakPos ||
4408 breakPos > testText.length() ||
4409 (breakPos > lastBreakPos && lastBreakPos > i)) {
4410 errln("%s break monkey test: "
4411 "Out of range value returned by BreakIterator::following().\n"
4412 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4413 name, seed, i, breakPos, lastBreakPos);
4414 break;
4415 }
4416 followingBreaks[breakPos] = 1;
4417 lastBreakPos = breakPos;
4418 }
4419
4420 // Find the break positions using the preceding() function.
4421 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4422 lastBreakPos = testText.length();
4423 precedingBreaks[testText.length()] = 1;
4424 for (i=testText.length(); i>0; i--) {
4425 breakPos = bi->preceding(i);
4426 if (breakPos >= i ||
4427 breakPos > lastBreakPos ||
4428 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4429 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4430 errln("%s break monkey test: "
4431 "Out of range value returned by BreakIterator::preceding().\n"
4432 "index=%d; prev returned %d; lastBreak=%d" ,
4433 name, i, breakPos, lastBreakPos);
4434 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4435 precedingBreaks[i] = 2; // Forces an error.
4436 }
4437 } else {
4438 if (breakPos >= 0) {
4439 precedingBreaks[breakPos] = 1;
4440 }
4441 lastBreakPos = breakPos;
4442 }
4443 }
4444
4445 // Compare the expected and actual results.
4446 for (i=0; i<=testText.length(); i++) {
4447 const char *errorType = NULL;
4448 if (forwardBreaks[i] != expectedBreaks[i]) {
4449 errorType = "next()";
4450 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4451 errorType = "previous()";
4452 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4453 errorType = "isBoundary()";
4454 } else if (followingBreaks[i] != expectedBreaks[i]) {
4455 errorType = "following()";
4456 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4457 errorType = "preceding()";
4458 }
4459
4460
4461 if (errorType != NULL) {
4462 // Format a range of the test text that includes the failure as
4463 // a data item that can be included in the rbbi test data file.
4464
4465 // Start of the range is the last point where expected and actual results
4466 // both agreed that there was a break position.
4467 int startContext = i;
4468 int32_t count = 0;
4469 for (;;) {
4470 if (startContext==0) { break; }
4471 startContext --;
4472 if (expectedBreaks[startContext] != 0) {
4473 if (count == 2) break;
4474 count ++;
4475 }
4476 }
4477
4478 // End of range is two expected breaks past the start position.
4479 int endContext = i + 1;
4480 int ci;
4481 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4482 for (;;) {
4483 if (endContext >= testText.length()) {break;}
4484 if (expectedBreaks[endContext-1] != 0) {
4485 if (count == 0) break;
4486 count --;
4487 }
4488 endContext ++;
4489 }
4490 }
4491
4492 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4493 UnicodeString errorText = "<data>";
4494 /***if (strcmp(errorType, "next()") == 0) {
4495 startContext = 0;
4496 endContext = testText.length();
4497
4498 printStringBreaks(testText, expected, expectedCount);
4499 }***/
4500
4501 for (ci=startContext; ci<endContext;) {
4502 UnicodeString hexChars("0123456789abcdef");
4503 UChar32 c;
4504 int bn;
4505 c = testText.char32At(ci);
4506 if (ci == i) {
4507 // This is the location of the error.
4508 errorText.append("<?>");
4509 } else if (expectedBreaks[ci] != 0) {
4510 // This a non-error expected break position.
4511 errorText.append("\\");
4512 }
4513 if (c < 0x10000) {
4514 errorText.append("\\u");
4515 for (bn=12; bn>=0; bn-=4) {
4516 errorText.append(hexChars.charAt((c>>bn)&0xf));
4517 }
4518 } else {
4519 errorText.append("\\U");
4520 for (bn=28; bn>=0; bn-=4) {
4521 errorText.append(hexChars.charAt((c>>bn)&0xf));
4522 }
4523 }
4524 ci = testText.moveIndex32(ci, 1);
4525 }
4526 errorText.append("\\");
4527 errorText.append("</data>\n");
4528
4529 // Output the error
4530 char charErrorTxt[500];
4531 UErrorCode status = U_ZERO_ERROR;
4532 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4533 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4534 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4535
4536 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4537 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4538 errorType, seed, i, charErrorTxt);
4539 break;
4540 }
4541 }
4542
4543 loopCount++;
4544 }
4545 #endif
4546 }
4547
4548
4549 // Bug 5532. UTF-8 based UText fails in dictionary code.
4550 // This test checks the initial patch,
4551 // which is to just keep it from crashing. Correct word boundaries
4552 // await a proper fix to the dictionary code.
4553 //
4554 void RBBITest::TestBug5532(void) {
4555 // Text includes a mixture of Thai and Latin.
4556 const unsigned char utf8Data[] = {
4557 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4558 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4559 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4560 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4561 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4562 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4563 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4564 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4565 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4566 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4567 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4568
4569 UErrorCode status = U_ZERO_ERROR;
4570 UText utext=UTEXT_INITIALIZER;
4571 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4572 TEST_ASSERT_SUCCESS(status);
4573
4574 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4575 TEST_ASSERT_SUCCESS(status);
4576 if (U_SUCCESS(status)) {
4577 bi->setText(&utext, status);
4578 TEST_ASSERT_SUCCESS(status);
4579
4580 int32_t breakCount = 0;
4581 int32_t previousBreak = -1;
4582 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4583 // For now, just make sure that the break iterator doesn't hang.
4584 TEST_ASSERT(previousBreak < bi->current());
4585 previousBreak = bi->current();
4586 }
4587 TEST_ASSERT(breakCount > 0);
4588 }
4589 delete bi;
4590 utext_close(&utext);
4591 }
4592
4593
4594 void RBBITest::TestBug9983(void) {
4595 UnicodeString text = UnicodeString("\\u002A" // * Other
4596 "\\uFF65" // Other
4597 "\\u309C" // Katakana
4598 "\\uFF9F" // Extend
4599 "\\uFF65" // Other
4600 "\\u0020" // Other
4601 "\\u0000").unescape();
4602
4603 UErrorCode status = U_ZERO_ERROR;
4604 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4605 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4606 TEST_ASSERT_SUCCESS(status);
4607 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4608 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4609 TEST_ASSERT_SUCCESS(status);
4610 if (U_FAILURE(status)) {
4611 return;
4612 }
4613 int32_t offset, rstatus, iterationCount;
4614
4615 brkiter->setText(text);
4616 brkiter->last();
4617 iterationCount = 0;
4618 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4619 iterationCount++;
4620 rstatus = brkiter->getRuleStatus();
4621 (void)rstatus; // Suppress set but not used warning.
4622 if (iterationCount >= 10) {
4623 break;
4624 }
4625 }
4626 TEST_ASSERT(iterationCount == 6);
4627
4628 brkiterPOSIX->setText(text);
4629 brkiterPOSIX->last();
4630 iterationCount = 0;
4631 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4632 iterationCount++;
4633 rstatus = brkiterPOSIX->getRuleStatus();
4634 (void)rstatus; // Suppress set but not used warning.
4635 if (iterationCount >= 10) {
4636 break;
4637 }
4638 }
4639 TEST_ASSERT(iterationCount == 6);
4640 }
4641
4642
4643 //
4644 // TestDebug - A place-holder test for debugging purposes.
4645 // For putting in fragments of other tests that can be invoked
4646 // for tracing without a lot of unwanted extra stuff happening.
4647 //
4648 void RBBITest::TestDebug(void) {
4649 #if 0
4650 UErrorCode status = U_ZERO_ERROR;
4651 int pos = 0;
4652 int ruleStatus = 0;
4653
4654 RuleBasedBreakIterator* bi =
4655 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4656 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4657 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4658 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4659 // UnicodeString s("Aaa. Bcd");
4660 s = s.unescape();
4661 bi->setText(s);
4662 UBool r = bi->isBoundary(8);
4663 printf("%s", r?"true":"false");
4664 return;
4665 pos = bi->last();
4666 do {
4667 // ruleStatus = bi->getRuleStatus();
4668 printf("%d\t%d\n", pos, ruleStatus);
4669 pos = bi->previous();
4670 } while (pos != BreakIterator::DONE);
4671 #endif
4672 }
4673
4674 void RBBITest::TestProperties() {
4675 UErrorCode errorCode = U_ZERO_ERROR;
4676 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4677 if (!prependSet.isEmpty()) {
4678 errln(
4679 "[:GCB=Prepend:] is not empty any more. "
4680 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4681 "change this test to the opposite condition.");
4682 }
4683 }
4684
4685 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */