]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-62123.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <utility>
21 #include <vector>
22
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
29 #endif
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
38
39 #include "charstr.h"
40 #include "cmemory.h"
41 #include "cstr.h"
42 #include "intltest.h"
43 #include "rbbitst.h"
44 #include "rbbidata.h"
45 #include "utypeinfo.h" // for 'typeid' to work
46 #include "uvector.h"
47 #include "uvectr32.h"
48
49
50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
51 #include "unicode/filteredbrk.h"
52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
53
// Report a test failure (with source file and line) when the condition x is false.
// The body is wrapped in do { } while (false) so that the macro expands to exactly one
// statement and can be used safely as the body of an if that has an else branch.
#define TEST_ASSERT(x) do { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} while (false)

// Report a test failure (with source file, line and error name) when errcode is a
// failing UErrorCode. Same single-statement wrapping as TEST_ASSERT.
#define TEST_ASSERT_SUCCESS(errcode) do { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} while (false)
59
60 //---------------------------------------------
61 // runIndexedTest
62 //---------------------------------------------
63
64
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitst.txt. In most cases it can;
//       it's much less work than writing a new test, diagnostic output in the event of failures
//       is good, and the test data file is shared with ICU4J, so eventually the test
//       will run there as well, without additional effort.
70
71 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
72 {
73 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
74 fTestParams = params;
75
76 TESTCASE_AUTO_BEGIN;
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(TestBug4153072);
79 #endif
80 #if !UCONFIG_NO_FILE_IO
81 TESTCASE_AUTO(TestUnicodeFiles);
82 #endif
83 TESTCASE_AUTO(TestGetAvailableLocales);
84 TESTCASE_AUTO(TestGetDisplayName);
85 #if !UCONFIG_NO_FILE_IO
86 TESTCASE_AUTO(TestEndBehaviour);
87 TESTCASE_AUTO(TestWordBreaks);
88 TESTCASE_AUTO(TestWordBoundary);
89 TESTCASE_AUTO(TestLineBreaks);
90 TESTCASE_AUTO(TestSentBreaks);
91 TESTCASE_AUTO(TestExtended);
92 #endif
93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestMonkey);
95 #endif
96 #if !UCONFIG_NO_FILE_IO
97 TESTCASE_AUTO(TestBug3818);
98 #endif
99 TESTCASE_AUTO(TestDebug);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestBug5775);
102 #endif
103 TESTCASE_AUTO(TestBug9983);
104 TESTCASE_AUTO(TestDictRules);
105 TESTCASE_AUTO(TestBug5532);
106 TESTCASE_AUTO(TestBug7547);
107 TESTCASE_AUTO(TestBug12797);
108 TESTCASE_AUTO(TestBug12918);
109 TESTCASE_AUTO(TestBug12932);
110 TESTCASE_AUTO(TestEmoji);
111 TESTCASE_AUTO(TestBug12519);
112 TESTCASE_AUTO(TestBug12677);
113 TESTCASE_AUTO(TestTableRedundancies);
114 TESTCASE_AUTO(TestBug13447);
115 TESTCASE_AUTO(TestReverse);
116 TESTCASE_AUTO(TestBug13692);
117 TESTCASE_AUTO_END;
118 }
119
120
121 //--------------------------------------------------------------------------------------
122 //
123 // RBBITest constructor and destructor
124 //
125 //--------------------------------------------------------------------------------------
126
127 RBBITest::RBBITest() {
128 fTestParams = NULL;
129 }
130
131
132 RBBITest::~RBBITest() {
133 }
134
135
136 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
137 UErrorCode status = U_ZERO_ERROR;
138 char name[100];
139 printf("code alpha extend alphanum type word sent line name\n");
140 int nextExpectedIndex = 0;
141 utext_setNativeIndex(tstr, 0);
142 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
143 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
144 printf("------------------------------------------------ %d\n", j);
145 ++nextExpectedIndex;
146 }
147
148 UChar32 c = utext_next32(tstr);
149 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
150 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
151 u_isUAlphabetic(c),
152 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
153 u_isalnum(c),
154 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
155 u_charType(c),
156 U_SHORT_PROPERTY_NAME),
157 u_getPropertyValueName(UCHAR_WORD_BREAK,
158 u_getIntPropertyValue(c,
159 UCHAR_WORD_BREAK),
160 U_SHORT_PROPERTY_NAME),
161 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
162 u_getIntPropertyValue(c,
163 UCHAR_SENTENCE_BREAK),
164 U_SHORT_PROPERTY_NAME),
165 u_getPropertyValueName(UCHAR_LINE_BREAK,
166 u_getIntPropertyValue(c,
167 UCHAR_LINE_BREAK),
168 U_SHORT_PROPERTY_NAME),
169 name);
170 }
171 }
172
173
174 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
175 UErrorCode status = U_ZERO_ERROR;
176 UText *tstr = NULL;
177 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
178 if (U_FAILURE(status)) {
179 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
180 return;
181 }
182 printStringBreaks(tstr, expected, expectedCount);
183 utext_close(tstr);
184 }
185
186
187 void RBBITest::TestBug3818() {
188 UErrorCode status = U_ZERO_ERROR;
189
190 // Four Thai words...
191 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
192 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
193 UnicodeString thaiStr(thaiWordData);
194
195 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
196 if (U_FAILURE(status) || bi == NULL) {
197 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
198 return;
199 }
200 bi->setText(thaiStr);
201
202 int32_t startOfSecondWord = bi->following(1);
203 if (startOfSecondWord != 4) {
204 errln("Fail at file %s, line %d expected start of word at 4, got %d",
205 __FILE__, __LINE__, startOfSecondWord);
206 }
207 startOfSecondWord = bi->following(0);
208 if (startOfSecondWord != 4) {
209 errln("Fail at file %s, line %d expected start of word at 4, got %d",
210 __FILE__, __LINE__, startOfSecondWord);
211 }
212 delete bi;
213 }
214
215
216 //---------------------------------------------
217 //
218 // other tests
219 //
220 //---------------------------------------------
221
222 void RBBITest::TestGetAvailableLocales()
223 {
224 int32_t locCount = 0;
225 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
226
227 if (locCount == 0)
228 dataerrln("getAvailableLocales() returned an empty list!");
229 // Just make sure that it's returning good memory.
230 int32_t i;
231 for (i = 0; i < locCount; ++i) {
232 logln(locList[i].getName());
233 }
234 }
235
236 //Testing the BreakIterator::getDisplayName() function
237 void RBBITest::TestGetDisplayName()
238 {
239 UnicodeString result;
240
241 BreakIterator::getDisplayName(Locale::getUS(), result);
242 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
243 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
244 + result);
245
246 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
247 if (result != "French (France)")
248 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
249 + result);
250 }
251 /**
252 * Test End Behaviour
253 * @bug 4068137
254 */
255 void RBBITest::TestEndBehaviour()
256 {
257 UErrorCode status = U_ZERO_ERROR;
258 UnicodeString testString("boo.");
259 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
260 if (U_FAILURE(status))
261 {
262 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
263 return;
264 }
265 wb->setText(testString);
266
267 if (wb->first() != 0)
268 errln("Didn't get break at beginning of string.");
269 if (wb->next() != 3)
270 errln("Didn't get break before period in \"boo.\"");
271 if (wb->current() != 4 && wb->next() != 4)
272 errln("Didn't get break at end of string.");
273 delete wb;
274 }
275 /*
276 * @bug 4153072
277 */
278 void RBBITest::TestBug4153072() {
279 UErrorCode status = U_ZERO_ERROR;
280 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
281 if (U_FAILURE(status))
282 {
283 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
284 return;
285 }
286 UnicodeString str("...Hello, World!...");
287 int32_t begin = 3;
288 int32_t end = str.length() - 3;
289 UBool onBoundary;
290
291 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
292 iter->adoptText(textIterator);
293 int index;
294 // Note: with the switch to UText, there is no way to restrict the
295 // iteration range to begin at an index other than zero.
296 // String character iterators created with a non-zero bound are
297 // treated by RBBI as being empty.
298 for (index = -1; index < begin + 1; ++index) {
299 onBoundary = iter->isBoundary(index);
300 if (index == 0? !onBoundary : onBoundary) {
301 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
302 " and begin index = " + begin);
303 }
304 }
305 delete iter;
306 }
307
308
309 //
310 // Test for problem reported by Ashok Matoria on 9 July 2007
311 // One.<kSoftHyphen><kSpace>Two.
312 //
313 // Sentence break at start (0) and then on calling next() it breaks at
314 // 'T' of "Two". Now, at this point if I do next() and
315 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
316 //
317 void RBBITest::TestBug5775() {
318 UErrorCode status = U_ZERO_ERROR;
319 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
320 TEST_ASSERT_SUCCESS(status);
321 if (U_FAILURE(status)) {
322 return;
323 }
324 // Check for status first for better handling of no data errors.
325 TEST_ASSERT(bi != NULL);
326 if (bi == NULL) {
327 return;
328 }
329
330 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
331 // 01234 56789
332 s = s.unescape();
333 bi->setText(s);
334 int pos = bi->next();
335 TEST_ASSERT(pos == 6);
336 pos = bi->next();
337 TEST_ASSERT(pos == 10);
338 pos = bi->previous();
339 TEST_ASSERT(pos == 6);
340 delete bi;
341 }
342
343
344
345 //------------------------------------------------------------------------------
346 //
347 // RBBITest::Extended Run RBBI Tests from an external test data file
348 //
349 //------------------------------------------------------------------------------
350
351 struct TestParams {
352 BreakIterator *bi; // Break iterator is set while parsing test source.
353 // Changed out whenever test data changes break type.
354
355 UnicodeString dataToBreak; // Data that is built up while parsing the test.
356 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
357 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
358 UVector32 *srcCol;
359
360 UText *textToBreak; // UText, could be UTF8 or UTF16.
361 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
362 CharString utf8String; // UTF-8 form of text to break.
363
364 TestParams(UErrorCode &status) : dataToBreak() {
365 bi = NULL;
366 expectedBreaks = new UVector32(status);
367 srcLine = new UVector32(status);
368 srcCol = new UVector32(status);
369 textToBreak = NULL;
370 textMap = new UVector32(status);
371 }
372
373 ~TestParams() {
374 delete bi;
375 delete expectedBreaks;
376 delete srcLine;
377 delete srcCol;
378 utext_close(textToBreak);
379 delete textMap;
380 }
381
382 int32_t getSrcLine(int32_t bp);
383 int32_t getExpectedBreak(int32_t bp);
384 int32_t getSrcCol(int32_t bp);
385
386 void setUTF16(UErrorCode &status);
387 void setUTF8(UErrorCode &status);
388 };
389
390 // Append a UnicodeString to a CharString with UTF-8 encoding.
391 // Substitute any invalid chars.
392 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
393 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
394 if (U_FAILURE(status)) {
395 return;
396 }
397 int32_t utf8Length;
398 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
399 src.getBuffer(), src.length(), // UTF-16 data
400 0xfffd, NULL, // Substitution char, number of subs.
401 &status);
402 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
403 return;
404 }
405 status = U_ZERO_ERROR;
406 int32_t capacity;
407 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
408 u_strToUTF8WithSub(buffer, utf8Length, NULL,
409 src.getBuffer(), src.length(),
410 0xfffd, NULL, &status);
411 dest.append(buffer, utf8Length, status);
412 }
413
414
415 void TestParams::setUTF16(UErrorCode &status) {
416 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
417 textMap->removeAllElements();
418 for (int32_t i=0; i<dataToBreak.length(); i++) {
419 if (i == dataToBreak.getChar32Start(i)) {
420 textMap->addElement(i, status);
421 } else {
422 textMap->addElement(-1, status);
423 }
424 }
425 textMap->addElement(dataToBreak.length(), status);
426 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
427 }
428
429
430 void TestParams::setUTF8(UErrorCode &status) {
431 if (U_FAILURE(status)) {
432 return;
433 }
434 utf8String.clear();
435 CharStringAppend(utf8String, dataToBreak, status);
436 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
437 if (U_FAILURE(status)) {
438 return;
439 }
440
441 textMap->removeAllElements();
442 int32_t utf16Index = 0;
443 for (;;) {
444 textMap->addElement(utf16Index, status);
445 UChar32 c32 = utext_current32(textToBreak);
446 if (c32 < 0) {
447 break;
448 }
449 utf16Index += U16_LENGTH(c32);
450 utext_next32(textToBreak);
451 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
452 textMap->addElement(-1, status);
453 }
454 }
455 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
456 }
457
458
459 int32_t TestParams::getSrcLine(int32_t bp) {
460 if (bp >= textMap->size()) {
461 bp = textMap->size() - 1;
462 }
463 int32_t i = 0;
464 for(; bp >= 0 ; --bp) {
465 // Move to a character boundary if we are not on one already.
466 i = textMap->elementAti(bp);
467 if (i >= 0) {
468 break;
469 }
470 }
471 return srcLine->elementAti(i);
472 }
473
474
475 int32_t TestParams::getExpectedBreak(int32_t bp) {
476 if (bp >= textMap->size()) {
477 return 0;
478 }
479 int32_t i = textMap->elementAti(bp);
480 int32_t retVal = 0;
481 if (i >= 0) {
482 retVal = expectedBreaks->elementAti(i);
483 }
484 return retVal;
485 }
486
487
488 int32_t TestParams::getSrcCol(int32_t bp) {
489 if (bp >= textMap->size()) {
490 bp = textMap->size() - 1;
491 }
492 int32_t i = 0;
493 for(; bp >= 0; --bp) {
494 // Move bp to a character boundary if we are not on one already.
495 i = textMap->elementAti(bp);
496 if (i >= 0) {
497 break;
498 }
499 }
500 return srcCol->elementAti(i);
501 }
502
503
504 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
505 int32_t bp;
506 int32_t prevBP;
507 int32_t i;
508
509 TEST_ASSERT_SUCCESS(status);
510 if (U_FAILURE(status)) {
511 return;
512 }
513
514 if (t->bi == NULL) {
515 return;
516 }
517
518 t->bi->setText(t->textToBreak, status);
519 //
520 // Run the iterator forward
521 //
522 prevBP = -1;
523 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
524 if (prevBP == bp) {
525 // Fail for lack of forward progress.
526 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
527 bp, t->getSrcLine(bp), t->getSrcCol(bp));
528 break;
529 }
530
531 // Check that there we didn't miss an expected break between the last one
532 // and this one.
533 for (i=prevBP+1; i<bp; i++) {
534 if (t->getExpectedBreak(i) != 0) {
535 int expected[] = {0, i};
536 printStringBreaks(t->dataToBreak, expected, 2);
537 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
538 i, t->getSrcLine(i), t->getSrcCol(i));
539 }
540 }
541
542 // Check that the break we did find was expected
543 if (t->getExpectedBreak(bp) == 0) {
544 int expected[] = {0, bp};
545 printStringBreaks(t->textToBreak, expected, 2);
546 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
547 bp, t->getSrcLine(bp), t->getSrcCol(bp));
548 } else {
549 // The break was expected.
550 // Check that the {nnn} tag value is correct.
551 int32_t expectedTagVal = t->getExpectedBreak(bp);
552 if (expectedTagVal == -1) {
553 expectedTagVal = 0;
554 }
555 int32_t line = t->getSrcLine(bp);
556 int32_t rs = t->bi->getRuleStatus();
557 if (rs != expectedTagVal) {
558 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
559 " Actual, Expected status = %4d, %4d",
560 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
561 }
562 }
563
564 prevBP = bp;
565 }
566
567 // Verify that there were no missed expected breaks after the last one found
568 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
569 if (t->getExpectedBreak(i) != 0) {
570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
571 i, t->getSrcLine(i), t->getSrcCol(i));
572 }
573 }
574
575 //
576 // Run the iterator backwards, verify that the same breaks are found.
577 //
578 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
579 bp = t->bi->last();
580 while (bp != BreakIterator::DONE) {
581 if (prevBP == bp) {
582 // Fail for lack of progress.
583 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
584 bp, t->getSrcLine(bp), t->getSrcCol(bp));
585 break;
586 }
587
588 // Check that we didn't miss an expected break between the last one
589 // and this one. (UVector returns zeros for index out of bounds.)
590 for (i=prevBP-1; i>bp; i--) {
591 if (t->getExpectedBreak(i) != 0) {
592 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
593 i, t->getSrcLine(i), t->getSrcCol(i));
594 }
595 }
596
597 // Check that the break we did find was expected
598 if (t->getExpectedBreak(bp) == 0) {
599 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
600 bp, t->getSrcLine(bp), t->getSrcCol(bp));
601 } else {
602 // The break was expected.
603 // Check that the {nnn} tag value is correct.
604 int32_t expectedTagVal = t->getExpectedBreak(bp);
605 if (expectedTagVal == -1) {
606 expectedTagVal = 0;
607 }
608 int line = t->getSrcLine(bp);
609 int32_t rs = t->bi->getRuleStatus();
610 if (rs != expectedTagVal) {
611 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
612 " Actual, Expected status = %4d, %4d",
613 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
614 }
615 }
616
617 prevBP = bp;
618 bp = t->bi->previous();
619 }
620
621 // Verify that there were no missed breaks prior to the last one found
622 for (i=prevBP-1; i>=0; i--) {
623 if (t->getExpectedBreak(i) != 0) {
624 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
625 i, t->getSrcLine(i), t->getSrcCol(i));
626 }
627 }
628
629 // Check isBoundary()
630 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
631 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
632 UBool boundaryFound = t->bi->isBoundary(i);
633 if (boundaryExpected != boundaryFound) {
634 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
635 " Expected, Actual= %s, %s",
636 i, t->getSrcLine(i), t->getSrcCol(i),
637 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
638 }
639 }
640
641 // Check following()
642 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
643 int32_t actualBreak = t->bi->following(i);
644 int32_t expectedBreak = BreakIterator::DONE;
645 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
646 if (t->getExpectedBreak(j) != 0) {
647 expectedBreak = j;
648 break;
649 }
650 }
651 if (expectedBreak != actualBreak) {
652 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
653 " Expected, Actual= %d, %d",
654 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
655 }
656 }
657
658 // Check preceding()
659 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
660 int32_t actualBreak = t->bi->preceding(i);
661 int32_t expectedBreak = BreakIterator::DONE;
662
663 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
664 // preceding(trailing byte) will return the index of some preceding code point,
665 // not the lead byte of the current code point, even though that has a smaller index.
666 // Therefore, start looking at the expected break data not at i-1, but at
667 // the start of code point index - 1.
668 utext_setNativeIndex(t->textToBreak, i);
669 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
670 for (; j >= 0; j--) {
671 if (t->getExpectedBreak(j) != 0) {
672 expectedBreak = j;
673 break;
674 }
675 }
676 if (expectedBreak != actualBreak) {
677 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
678 " Expected, Actual= %d, %d",
679 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
680 }
681 }
682 }
683
684
685 void RBBITest::TestExtended() {
686 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
687 // data driven test closely entangles filtered and regular data.
688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
689 UErrorCode status = U_ZERO_ERROR;
690 Locale locale("");
691
692 TestParams tp(status);
693
694 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
695 if (U_FAILURE(status)) {
696 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
697 }
698
699 //
700 // Open and read the test data file.
701 //
702 const char *testDataDirectory = IntlTest::getSourceTestData(status);
703 CharString testFileName(testDataDirectory, -1, status);
704 testFileName.append("rbbitst.txt", -1, status);
705
706 int len;
707 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
708 if (U_FAILURE(status)) {
709 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
710 return;
711 }
712
713 bool skipTest = false; // Skip this test?
714
715 //
716 // Put the test data into a UnicodeString
717 //
718 UnicodeString testString(FALSE, testFile, len);
719
720 enum EParseState{
721 PARSE_COMMENT,
722 PARSE_TAG,
723 PARSE_DATA,
724 PARSE_NUM,
725 PARSE_RULES
726 }
727 parseState = PARSE_TAG;
728
729 EParseState savedState = PARSE_TAG;
730
731 int32_t lineNum = 1;
732 int32_t colStart = 0;
733 int32_t column = 0;
734 int32_t charIdx = 0;
735
736 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
737
738 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
739 int32_t rulesFirstLine; // Line number of the start of current <rules> block
740
741 for (charIdx = 0; charIdx < len; ) {
742 status = U_ZERO_ERROR;
743 UChar c = testString.charAt(charIdx);
744 charIdx++;
745 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
746 // treat CRLF as a unit
747 c = u'\n';
748 charIdx++;
749 }
750 if (c == u'\n' || c == u'\r') {
751 lineNum++;
752 colStart = charIdx;
753 }
754 column = charIdx - colStart + 1;
755
756 switch (parseState) {
757 case PARSE_COMMENT:
758 if (c == u'\n' || c == u'\r') {
759 parseState = savedState;
760 }
761 break;
762
763 case PARSE_TAG:
764 {
765 if (c == u'#') {
766 parseState = PARSE_COMMENT;
767 savedState = PARSE_TAG;
768 break;
769 }
770 if (u_isUWhiteSpace(c)) {
771 break;
772 }
773 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
774 delete tp.bi;
775 tp.bi = BreakIterator::createWordInstance(locale, status);
776 skipTest = false;
777 charIdx += 5;
778 break;
779 }
780 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
781 delete tp.bi;
782 tp.bi = BreakIterator::createCharacterInstance(locale, status);
783 skipTest = false;
784 charIdx += 5;
785 break;
786 }
787 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
788 delete tp.bi;
789 tp.bi = BreakIterator::createLineInstance(locale, status);
790 skipTest = false;
791 charIdx += 5;
792 break;
793 }
794 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
795 delete tp.bi;
796 tp.bi = BreakIterator::createSentenceInstance(locale, status);
797 skipTest = false;
798 charIdx += 5;
799 break;
800 }
801 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
802 delete tp.bi;
803 tp.bi = BreakIterator::createTitleInstance(locale, status);
804 charIdx += 6;
805 break;
806 }
807
808 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
809 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
810 charIdx = testString.indexOf(u'>', charIdx) + 1;
811 parseState = PARSE_RULES;
812 rules.remove();
813 rulesFirstLine = lineNum;
814 break;
815 }
816
817 // <locale loc_name>
818 localeMatcher.reset(testString);
819 if (localeMatcher.lookingAt(charIdx-1, status)) {
820 UnicodeString localeName = localeMatcher.group(1, status);
821 char localeName8[100];
822 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
823 locale = Locale::createFromName(localeName8);
824 charIdx += localeMatcher.group(0, status).length() - 1;
825 TEST_ASSERT_SUCCESS(status);
826 break;
827 }
828 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
829 parseState = PARSE_DATA;
830 charIdx += 5;
831 tp.dataToBreak = "";
832 tp.expectedBreaks->removeAllElements();
833 tp.srcCol ->removeAllElements();
834 tp.srcLine->removeAllElements();
835 break;
836 }
837
838 errln("line %d: Tag expected in test file.", lineNum);
839 parseState = PARSE_COMMENT;
840 savedState = PARSE_DATA;
841 goto end_test; // Stop the test.
842 }
843 break;
844
845 case PARSE_RULES:
846 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
847 charIdx += 7;
848 parseState = PARSE_TAG;
849 delete tp.bi;
850 UParseError pe;
851 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
852 skipTest = U_FAILURE(status);
853 if (U_FAILURE(status)) {
854 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
855 rulesFirstLine + pe.line - 1, u_errorName(status));
856 }
857 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
858 charIdx += 10;
859 parseState = PARSE_TAG;
860 UErrorCode ec = U_ZERO_ERROR;
861 UParseError pe;
862 RuleBasedBreakIterator bi(rules, pe, ec);
863 if (U_SUCCESS(ec)) {
864 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
865 rulesFirstLine + pe.line - 1);
866 }
867 } else {
868 rules.append(c);
869 }
870 break;
871
872 case PARSE_DATA:
873 if (c == u'\u2022') { // u'•'
874 int32_t breakIdx = tp.dataToBreak.length();
875 tp.expectedBreaks->setSize(breakIdx+1);
876 tp.expectedBreaks->setElementAt(-1, breakIdx);
877 tp.srcLine->setSize(breakIdx+1);
878 tp.srcLine->setElementAt(lineNum, breakIdx);
879 tp.srcCol ->setSize(breakIdx+1);
880 tp.srcCol ->setElementAt(column, breakIdx);
881 break;
882 }
883
884 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
885 // Add final entry to mappings from break location to source file position.
886 // Need one extra because last break position returned is after the
887 // last char in the data, not at the last char.
888 tp.srcLine->addElement(lineNum, status);
889 tp.srcCol ->addElement(column, status);
890
891 parseState = PARSE_TAG;
892 charIdx += 6;
893
894 if (!skipTest) {
895 // RUN THE TEST!
896 status = U_ZERO_ERROR;
897 tp.setUTF16(status);
898 executeTest(&tp, status);
899 TEST_ASSERT_SUCCESS(status);
900
901 // Run again, this time with UTF-8 text wrapped in a UText.
902 status = U_ZERO_ERROR;
903 tp.setUTF8(status);
904 TEST_ASSERT_SUCCESS(status);
905 executeTest(&tp, status);
906 }
907 break;
908 }
909
910 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
911 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
912 // Get the code point from the name and insert it into the test data.
913 // (Damn, no API takes names in Unicode !!!
914 // we've got to take it back to char *)
915 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
916 int32_t nameLength = nameEndIdx - (charIdx+2);
917 char charNameBuf[200];
918 UChar32 theChar = -1;
919 if (nameEndIdx != -1) {
920 UErrorCode status = U_ZERO_ERROR;
921 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
922 charNameBuf[sizeof(charNameBuf)-1] = 0;
923 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
924 if (U_FAILURE(status)) {
925 theChar = -1;
926 }
927 }
928 if (theChar == -1) {
929 errln("Error in named character in test file at line %d, col %d",
930 lineNum, column);
931 } else {
932 // Named code point was recognized. Insert it
933 // into the test data.
934 tp.dataToBreak.append(theChar);
935 while (tp.dataToBreak.length() > tp.srcLine->size()) {
936 tp.srcLine->addElement(lineNum, status);
937 tp.srcCol ->addElement(column, status);
938 }
939 }
940 if (nameEndIdx > charIdx) {
941 charIdx = nameEndIdx+1;
942
943 }
944 break;
945 }
946
947
948
949 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
950 charIdx++;
951 int32_t breakIdx = tp.dataToBreak.length();
952 tp.expectedBreaks->setSize(breakIdx+1);
953 tp.expectedBreaks->setElementAt(-1, breakIdx);
954 tp.srcLine->setSize(breakIdx+1);
955 tp.srcLine->setElementAt(lineNum, breakIdx);
956 tp.srcCol ->setSize(breakIdx+1);
957 tp.srcCol ->setElementAt(column, breakIdx);
958 break;
959 }
960
961 if (c == u'<') {
962 tagValue = 0;
963 parseState = PARSE_NUM;
964 break;
965 }
966
967 if (c == u'#' && column==3) { // TODO: why is column off so far?
968 parseState = PARSE_COMMENT;
969 savedState = PARSE_DATA;
970 break;
971 }
972
973 if (c == u'\\') {
974 // Check for \ at end of line, a line continuation.
975 // Advance over (discard) the newline
976 UChar32 cp = testString.char32At(charIdx);
977 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
978 // We have a CR LF
979 // Need an extra increment of the input ptr to move over both of them
980 charIdx++;
981 }
982 if (cp == u'\n' || cp == u'\r') {
983 lineNum++;
984 colStart = charIdx;
985 charIdx++;
986 break;
987 }
988
989 // Let unescape handle the back slash.
990 cp = testString.unescapeAt(charIdx);
991 if (cp != -1) {
992 // Escape sequence was recognized. Insert the char
993 // into the test data.
994 tp.dataToBreak.append(cp);
995 while (tp.dataToBreak.length() > tp.srcLine->size()) {
996 tp.srcLine->addElement(lineNum, status);
997 tp.srcCol ->addElement(column, status);
998 }
999 break;
1000 }
1001
1002
1003 // Not a recognized backslash escape sequence.
1004 // Take the next char as a literal.
1005 // TODO: Should this be an error?
1006 c = testString.charAt(charIdx);
1007 charIdx = testString.moveIndex32(charIdx, 1);
1008 }
1009
1010 // Normal, non-escaped data char.
1011 tp.dataToBreak.append(c);
1012
1013 // Save the mapping from offset in the data to line/column numbers in
1014 // the original input file. Will be used for better error messages only.
1015 // If there's an expected break before this char, the slot in the mapping
1016 // vector will already be set for this char; don't overwrite it.
1017 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1018 tp.srcLine->addElement(lineNum, status);
1019 tp.srcCol ->addElement(column, status);
1020 }
1021 break;
1022
1023
1024 case PARSE_NUM:
1025 // We are parsing an expected numeric tag value, like <1234>,
1026 // within a chunk of data.
1027 if (u_isUWhiteSpace(c)) {
1028 break;
1029 }
1030
1031 if (c == u'>') {
1032 // Finished the number. Add the info to the expected break data,
1033 // and switch parse state back to doing plain data.
1034 parseState = PARSE_DATA;
1035 if (tagValue == 0) {
1036 tagValue = -1;
1037 }
1038 int32_t breakIdx = tp.dataToBreak.length();
1039 tp.expectedBreaks->setSize(breakIdx+1);
1040 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1041 tp.srcLine->setSize(breakIdx+1);
1042 tp.srcLine->setElementAt(lineNum, breakIdx);
1043 tp.srcCol ->setSize(breakIdx+1);
1044 tp.srcCol ->setElementAt(column, breakIdx);
1045 break;
1046 }
1047
1048 if (u_isdigit(c)) {
1049 tagValue = tagValue*10 + u_charDigitValue(c);
1050 break;
1051 }
1052
1053 errln("Syntax Error in test file at line %d, col %d",
1054 lineNum, column);
1055 parseState = PARSE_COMMENT;
1056 goto end_test; // Stop the test
1057 break;
1058 }
1059
1060
1061 if (U_FAILURE(status)) {
1062 dataerrln("ICU Error %s while parsing test file at line %d.",
1063 u_errorName(status), lineNum);
1064 status = U_ZERO_ERROR;
1065 goto end_test; // Stop the test
1066 }
1067
1068 }
1069
1070 // Reached end of test file. Raise an error if parseState indicates that we are
1071 // within a block that should have been terminated.
1072
1073 if (parseState == PARSE_RULES) {
1074 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075 lineNum, rulesFirstLine);
1076 }
1077 if (parseState == PARSE_DATA) {
1078 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1079 }
1080
1081
1082 end_test:
1083 delete [] testFile;
1084 #endif
1085 }
1086
1087
1088 //-------------------------------------------------------------------------------
1089 //
1090 // TestDictRules create a break iterator from source rules that includes a
1091 // dictionary range. Regression for bug #7130. Source rules
1092 // do not declare a break iterator type (word, line, sentence, etc.),
1093 // but the dictionary code, without a type, would loop.
1094 //
1095 //-------------------------------------------------------------------------------
1096 void RBBITest::TestDictRules() {
1097 const char *rules = "$dictionary = [a-z]; \n"
1098 "!!forward; \n"
1099 "$dictionary $dictionary; \n"
1100 "!!reverse; \n"
1101 "$dictionary $dictionary; \n";
1102 const char *text = "aa";
1103 UErrorCode status = U_ZERO_ERROR;
1104 UParseError parseError;
1105
1106 RuleBasedBreakIterator bi(rules, parseError, status);
1107 if (U_SUCCESS(status)) {
1108 UnicodeString utext = text;
1109 bi.setText(utext);
1110 int32_t position;
1111 int32_t loops;
1112 for (loops = 0; loops<10; loops++) {
1113 position = bi.next();
1114 if (position == RuleBasedBreakIterator::DONE) {
1115 break;
1116 }
1117 }
1118 TEST_ASSERT(loops == 1);
1119 } else {
1120 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1121 }
1122 }
1123
1124
1125
1126 //-------------------------------------------------------------------------------
1127 //
1128 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1129 // return the data in one big UChar * buffer, which the caller must delete.
1130 //
1131 // parameters:
1132 // fileName: the name of the file, with no directory part. The test data directory
1133 // is assumed.
1134 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1135 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1136 // specified here. The BOM, if it exists, will be stripped from the returned data.
1137 // Pass NULL for the system default encoding.
1138 // status
1139 // returns:
1140 // The file data, converted to UChar.
1141 // The caller must delete this when done with
1142 // delete [] theBuffer;
1143 //
1144 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1145 // Move this function to some common place.
1146 //
1147 //--------------------------------------------------------------------------------
1148 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1149 UChar *retPtr = NULL;
1150 char *fileBuf = NULL;
1151 UConverter* conv = NULL;
1152 FILE *f = NULL;
1153
1154 ulen = 0;
1155 if (U_FAILURE(status)) {
1156 return retPtr;
1157 }
1158
1159 //
1160 // Open the file.
1161 //
1162 f = fopen(fileName, "rb");
1163 if (f == 0) {
1164 dataerrln("Error opening test data file %s\n", fileName);
1165 status = U_FILE_ACCESS_ERROR;
1166 return NULL;
1167 }
1168 //
1169 // Read it in
1170 //
1171 int fileSize;
1172 int amt_read;
1173
1174 fseek( f, 0, SEEK_END);
1175 fileSize = ftell(f);
1176 fileBuf = new char[fileSize];
1177 fseek(f, 0, SEEK_SET);
1178 amt_read = fread(fileBuf, 1, fileSize, f);
1179 if (amt_read != fileSize || fileSize <= 0) {
1180 errln("Error reading test data file.");
1181 goto cleanUpAndReturn;
1182 }
1183
1184 //
1185 // Look for a Unicode Signature (BOM) on the data just read
1186 //
1187 int32_t signatureLength;
1188 const char * fileBufC;
1189 const char* bomEncoding;
1190
1191 fileBufC = fileBuf;
1192 bomEncoding = ucnv_detectUnicodeSignature(
1193 fileBuf, fileSize, &signatureLength, &status);
1194 if(bomEncoding!=NULL ){
1195 fileBufC += signatureLength;
1196 fileSize -= signatureLength;
1197 encoding = bomEncoding;
1198 }
1199
1200 //
1201 // Open a converter to take the rule file to UTF-16
1202 //
1203 conv = ucnv_open(encoding, &status);
1204 if (U_FAILURE(status)) {
1205 goto cleanUpAndReturn;
1206 }
1207
1208 //
1209 // Convert the rules to UChar.
1210 // Preflight first to determine required buffer size.
1211 //
1212 ulen = ucnv_toUChars(conv,
1213 NULL, // dest,
1214 0, // destCapacity,
1215 fileBufC,
1216 fileSize,
1217 &status);
1218 if (status == U_BUFFER_OVERFLOW_ERROR) {
1219 // Buffer Overflow is expected from the preflight operation.
1220 status = U_ZERO_ERROR;
1221
1222 retPtr = new UChar[ulen+1];
1223 ucnv_toUChars(conv,
1224 retPtr, // dest,
1225 ulen+1,
1226 fileBufC,
1227 fileSize,
1228 &status);
1229 }
1230
1231 cleanUpAndReturn:
1232 fclose(f);
1233 delete []fileBuf;
1234 ucnv_close(conv);
1235 if (U_FAILURE(status)) {
1236 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1237 delete []retPtr;
1238 retPtr = 0;
1239 ulen = 0;
1240 };
1241 return retPtr;
1242 }
1243
1244
1245
1246 //--------------------------------------------------------------------------------------------
1247 //
1248 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1249 //
1250 //-------------------------------------------------------------------------------------------
1251 void RBBITest::TestUnicodeFiles() {
1252 RuleBasedBreakIterator *bi;
1253 UErrorCode status = U_ZERO_ERROR;
1254
1255 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1256 TEST_ASSERT_SUCCESS(status);
1257 if (U_SUCCESS(status)) {
1258 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1259 }
1260 delete bi;
1261
1262 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1263 TEST_ASSERT_SUCCESS(status);
1264 if (U_SUCCESS(status)) {
1265 runUnicodeTestData("WordBreakTest.txt", bi);
1266 }
1267 delete bi;
1268
1269 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1270 TEST_ASSERT_SUCCESS(status);
1271 if (U_SUCCESS(status)) {
1272 runUnicodeTestData("SentenceBreakTest.txt", bi);
1273 }
1274 delete bi;
1275
1276 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1277 TEST_ASSERT_SUCCESS(status);
1278 if (U_SUCCESS(status)) {
1279 runUnicodeTestData("LineBreakTest.txt", bi);
1280 }
1281 delete bi;
1282 }
1283
1284
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped because ICU is not yet able to fully implement the spec.
1287 // See ticket #7270.
1288
1289 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1290 static struct TestCase {
1291 const char *fFileName;
1292 const UChar *fString;
1293 } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
1294 {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
1295 {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
1296 {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
1297 {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
1298 {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
1299 {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
1300 // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
1301 {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
1302 {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
1303 {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
1304
1305 // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
1306 {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
1307 {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
1308 };
1309
1310 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1311 const TestCase &badCase = badTestCases[n];
1312 if (!strcmp(fileName, badCase.fFileName) &&
1313 testCase == UnicodeString(badCase.fString)) {
1314 return logKnownIssue("7270");
1315 }
1316 }
1317 return FALSE;
1318 }
1319
1320
1321 //--------------------------------------------------------------------------------------------
1322 //
1323 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1324 //
1325 //-------------------------------------------------------------------------------------------
1326 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1327 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1328 UErrorCode status = U_ZERO_ERROR;
1329
1330 //
1331 // Open and read the test data file, put it into a UnicodeString.
1332 //
1333 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1334 char testFileName[1000];
1335 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1336 dataerrln("Can't open test data. Path too long.");
1337 return;
1338 }
1339 strcpy(testFileName, testDataDirectory);
1340 strcat(testFileName, fileName);
1341
1342 logln("Opening data file %s\n", fileName);
1343
1344 int len;
1345 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1346 if (status != U_FILE_ACCESS_ERROR) {
1347 TEST_ASSERT_SUCCESS(status);
1348 TEST_ASSERT(testFile != NULL);
1349 }
1350 if (U_FAILURE(status) || testFile == NULL) {
1351 return; /* something went wrong, error already output */
1352 }
1353 UnicodeString testFileAsString(TRUE, testFile, len);
1354
1355 //
1356 // Parse the test data file using a regular expression.
1357 // Each kind of token is recognized in its own capture group; what type of item was scanned
1358 // is identified by which group had a match.
1359 //
1360 // Caputure Group # 1 2 3 4 5
1361 // Parses this item: divide x hex digits comment \n unrecognized \n
1362 //
1363 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1364 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1365 UnicodeString testString;
1366 UVector32 breakPositions(status);
1367 int lineNumber = 1;
1368 TEST_ASSERT_SUCCESS(status);
1369 if (U_FAILURE(status)) {
1370 return;
1371 }
1372
1373 //
1374 // Scan through each test case, building up the string to be broken in testString,
1375 // and the positions that should be boundaries in the breakPositions vector.
1376 //
1377 int spin = 0;
1378 while (tokenMatcher.find()) {
1379 if(tokenMatcher.hitEnd()) {
1380 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1381 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1382 and caused an infinite loop here on EBCDIC systems!
1383 */
1384 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1385 // return;
1386 }
1387 if (tokenMatcher.start(1, status) >= 0) {
1388 // Scanned a divide sign, indicating a break position in the test data.
1389 if (testString.length()>0) {
1390 breakPositions.addElement(testString.length(), status);
1391 }
1392 }
1393 else if (tokenMatcher.start(2, status) >= 0) {
1394 // Scanned an 'x', meaning no break at this position in the test data
1395 // Nothing to be done here.
1396 }
1397 else if (tokenMatcher.start(3, status) >= 0) {
1398 // Scanned Hex digits. Convert them to binary, append to the character data string.
1399 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1400 int length = hexNumber.length();
1401 if (length<=8) {
1402 char buf[10];
1403 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1404 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1405 if (c<=0x10ffff) {
1406 testString.append(c);
1407 } else {
1408 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1409 fileName, lineNumber);
1410 }
1411 } else {
1412 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1413 fileName, lineNumber);
1414 }
1415 }
1416 else if (tokenMatcher.start(4, status) >= 0) {
1417 // Scanned to end of a line, possibly skipping over a comment in the process.
1418 // If the line from the file contained test data, run the test now.
1419 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1420 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1421 }
1422
1423 // Clear out this test case.
1424 // The string and breakPositions vector will be refilled as the next
1425 // test case is parsed.
1426 testString.remove();
1427 breakPositions.removeAllElements();
1428 lineNumber++;
1429 } else {
1430 // Scanner catchall. Something unrecognized appeared on the line.
1431 char token[16];
1432 UnicodeString uToken = tokenMatcher.group(0, status);
1433 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1434 token[sizeof(token)-1] = 0;
1435 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1436
1437 // Clean up, in preparation for continuing with the next line.
1438 testString.remove();
1439 breakPositions.removeAllElements();
1440 lineNumber++;
1441 }
1442 TEST_ASSERT_SUCCESS(status);
1443 if (U_FAILURE(status)) {
1444 break;
1445 }
1446 }
1447
1448 delete [] testFile;
1449 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1450 }
1451
1452 //--------------------------------------------------------------------------------------------
1453 //
1454 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1455 // test data files. Do only a simple, forward-only check -
1456 // this test is mostly to check that ICU and the Unicode
1457 // data agree with each other.
1458 //
1459 //--------------------------------------------------------------------------------------------
1460 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1461 const UnicodeString &testString, // Text data to be broken
1462 UVector32 *breakPositions, // Positions where breaks should be found.
1463 RuleBasedBreakIterator *bi) {
1464 int32_t pos; // Break Position in the test string
1465 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1466 int32_t expectedPos; // Expected break position (index into test string)
1467
1468 bi->setText(testString);
1469 pos = bi->first();
1470 pos = bi->next();
1471
1472 while (pos != BreakIterator::DONE) {
1473 if (expectedI >= breakPositions->size()) {
1474 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1475 testFileName, lineNumber, pos);
1476 break;
1477 }
1478 expectedPos = breakPositions->elementAti(expectedI);
1479 if (pos < expectedPos) {
1480 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1481 testFileName, lineNumber, pos);
1482 break;
1483 }
1484 if (pos > expectedPos) {
1485 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1486 testFileName, lineNumber, expectedPos);
1487 break;
1488 }
1489 pos = bi->next();
1490 expectedI++;
1491 }
1492
1493 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1494 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1495 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1496 }
1497 }
1498
1499
1500
1501 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1502 //---------------------------------------------------------------------------------------
1503 //
1504 // class RBBIMonkeyKind
1505 //
1506 // Monkey Test for Break Iteration
1507 // Abstract interface class. Concrete derived classes independently
1508 // implement the break rules for different iterator types.
1509 //
1510 // The Monkey Test itself doesn't know which type of break iterator it is
1511 // testing, but works purely in terms of the interface defined here.
1512 //
1513 //---------------------------------------------------------------------------------------
1514 class RBBIMonkeyKind {
1515 public:
1516 // Return a UVector of UnicodeSets, representing the character classes used
1517 // for this type of iterator.
1518 virtual UVector *charClasses() = 0;
1519
1520 // Set the test text on which subsequent calls to next() will operate
1521 virtual void setText(const UnicodeString &s) = 0;
1522
1523 // Find the next break postion, starting from the prev break position, or from zero.
1524 // Return -1 after reaching end of string.
1525 virtual int32_t next(int32_t i) = 0;
1526
1527 virtual ~RBBIMonkeyKind();
1528 UErrorCode deferredStatus;
1529
1530
1531 protected:
1532 RBBIMonkeyKind();
1533
1534 private:
1535 };
1536
1537 RBBIMonkeyKind::RBBIMonkeyKind() {
1538 deferredStatus = U_ZERO_ERROR;
1539 }
1540
1541 RBBIMonkeyKind::~RBBIMonkeyKind() {
1542 }
1543
1544
1545 //----------------------------------------------------------------------------------------
1546 //
1547 // Random Numbers. Similar to standard lib rand() and srand()
1548 // Not using library to
1549 // 1. Get same results on all platforms.
1550 // 2. Get access to current seed, to more easily reproduce failures.
1551 //
1552 //---------------------------------------------------------------------------------------
// Current generator state. Kept at file scope so a failing run can be reproduced
// by restoring the seed.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767,
// taken from bits 16..30 of the new seed.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;   // Unsigned arithmetic, wraps modulo 2^32.
    return (m_seed >> 16) & 0x7fff;               // Same as (m_seed / 65536) % 32768.
}
1560
1561
1562 //------------------------------------------------------------------------------------------
1563 //
1564 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1565 // of RBBIMonkeyKind.
1566 //
1567 //------------------------------------------------------------------------------------------
1568 class RBBICharMonkey: public RBBIMonkeyKind {
1569 public:
1570 RBBICharMonkey();
1571 virtual ~RBBICharMonkey();
1572 virtual UVector *charClasses();
1573 virtual void setText(const UnicodeString &s);
1574 virtual int32_t next(int32_t i);
1575 private:
1576 UVector *fSets;
1577
1578 UnicodeSet *fCRLFSet;
1579 UnicodeSet *fControlSet;
1580 UnicodeSet *fExtendSet;
1581 UnicodeSet *fZWJSet;
1582 UnicodeSet *fRegionalIndicatorSet;
1583 UnicodeSet *fPrependSet;
1584 UnicodeSet *fSpacingSet;
1585 UnicodeSet *fLSet;
1586 UnicodeSet *fVSet;
1587 UnicodeSet *fTSet;
1588 UnicodeSet *fLVSet;
1589 UnicodeSet *fLVTSet;
1590 UnicodeSet *fHangulSet;
1591 UnicodeSet *fExtendedPictSet;
1592 UnicodeSet *fAnySet;
1593
1594 const UnicodeString *fText;
1595 };
1596
1597
1598 RBBICharMonkey::RBBICharMonkey() {
1599 UErrorCode status = U_ZERO_ERROR;
1600
1601 fText = NULL;
1602
1603 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1604 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1605 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1606 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1607 fRegionalIndicatorSet =
1608 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1609 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1610 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1611 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1612 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1613 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1614 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1615 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1616 fHangulSet = new UnicodeSet();
1617 fHangulSet->addAll(*fLSet);
1618 fHangulSet->addAll(*fVSet);
1619 fHangulSet->addAll(*fTSet);
1620 fHangulSet->addAll(*fLVSet);
1621 fHangulSet->addAll(*fLVTSet);
1622
1623 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1624 fAnySet = new UnicodeSet(0, 0x10ffff);
1625
1626 fSets = new UVector(status);
1627 fSets->addElement(fCRLFSet, status);
1628 fSets->addElement(fControlSet, status);
1629 fSets->addElement(fExtendSet, status);
1630 fSets->addElement(fRegionalIndicatorSet, status);
1631 if (!fPrependSet->isEmpty()) {
1632 fSets->addElement(fPrependSet, status);
1633 }
1634 fSets->addElement(fSpacingSet, status);
1635 fSets->addElement(fHangulSet, status);
1636 fSets->addElement(fAnySet, status);
1637 fSets->addElement(fZWJSet, status);
1638 fSets->addElement(fExtendedPictSet, status);
1639 if (U_FAILURE(status)) {
1640 deferredStatus = status;
1641 }
1642 }
1643
1644
1645 void RBBICharMonkey::setText(const UnicodeString &s) {
1646 fText = &s;
1647 }
1648
1649
1650
1651 int32_t RBBICharMonkey::next(int32_t prevPos) {
1652 int p0, p1, p2, p3; // Indices of the significant code points around the
1653 // break position being tested. The candidate break
1654 // location is before p2.
1655
1656 int breakPos = -1;
1657
1658 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1659 UChar32 cBase; // for (X Extend*) patterns, the X character.
1660
1661 if (U_FAILURE(deferredStatus)) {
1662 return -1;
1663 }
1664
1665 // Previous break at end of string. return DONE.
1666 if (prevPos >= fText->length()) {
1667 return -1;
1668 }
1669 p0 = p1 = p2 = p3 = prevPos;
1670 c3 = fText->char32At(prevPos);
1671 c0 = c1 = c2 = cBase = 0;
1672 (void)p0; // suppress set but not used warning.
1673 (void)c0;
1674
1675 // Loop runs once per "significant" character position in the input text.
1676 for (;;) {
1677 // Move all of the positions forward in the input string.
1678 p0 = p1; c0 = c1;
1679 p1 = p2; c1 = c2;
1680 p2 = p3; c2 = c3;
1681
1682 // Advancd p3 by one codepoint
1683 p3 = fText->moveIndex32(p3, 1);
1684 c3 = fText->char32At(p3);
1685
1686 if (p1 == p2) {
1687 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1688 continue;
1689 }
1690 if (p2 == fText->length()) {
1691 // Reached end of string. Always a break position.
1692 break;
1693 }
1694
1695 // Rule GB3 CR x LF
1696 // No Extend or Format characters may appear between the CR and LF,
1697 // which requires the additional check for p2 immediately following p1.
1698 //
1699 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1700 continue;
1701 }
1702
1703 // Rule (GB4). ( Control | CR | LF ) <break>
1704 if (fControlSet->contains(c1) ||
1705 c1 == 0x0D ||
1706 c1 == 0x0A) {
1707 break;
1708 }
1709
1710 // Rule (GB5) <break> ( Control | CR | LF )
1711 //
1712 if (fControlSet->contains(c2) ||
1713 c2 == 0x0D ||
1714 c2 == 0x0A) {
1715 break;
1716 }
1717
1718
1719 // Rule (GB6) L x ( L | V | LV | LVT )
1720 if (fLSet->contains(c1) &&
1721 (fLSet->contains(c2) ||
1722 fVSet->contains(c2) ||
1723 fLVSet->contains(c2) ||
1724 fLVTSet->contains(c2))) {
1725 continue;
1726 }
1727
1728 // Rule (GB7) ( LV | V ) x ( V | T )
1729 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1730 (fVSet->contains(c2) || fTSet->contains(c2))) {
1731 continue;
1732 }
1733
1734 // Rule (GB8) ( LVT | T) x T
1735 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1736 fTSet->contains(c2)) {
1737 continue;
1738 }
1739
1740 // Rule (GB9) x (Extend | ZWJ)
1741 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1742 if (!fExtendSet->contains(c1)) {
1743 cBase = c1;
1744 }
1745 continue;
1746 }
1747
1748 // Rule (GB9a) x SpacingMark
1749 if (fSpacingSet->contains(c2)) {
1750 continue;
1751 }
1752
1753 // Rule (GB9b) Prepend x
1754 if (fPrependSet->contains(c1)) {
1755 continue;
1756 }
1757
1758 // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1759 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1760 continue;
1761 }
1762
1763 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
1764 // Note: The first if condition is a little tricky. We only need to force
1765 // a break if there are three or more contiguous RIs. If there are
1766 // only two, a break following will occur via other rules, and will include
1767 // any trailing extend characters, which is needed behavior.
1768 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1769 && fRegionalIndicatorSet->contains(c2)) {
1770 break;
1771 }
1772 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1773 continue;
1774 }
1775
1776 // Rule (GB999) Any <break> Any
1777 break;
1778 }
1779
1780 breakPos = p2;
1781 return breakPos;
1782 }
1783
1784
1785
1786 UVector *RBBICharMonkey::charClasses() {
1787 return fSets;
1788 }
1789
1790
1791 RBBICharMonkey::~RBBICharMonkey() {
1792 delete fSets;
1793 delete fCRLFSet;
1794 delete fControlSet;
1795 delete fExtendSet;
1796 delete fRegionalIndicatorSet;
1797 delete fPrependSet;
1798 delete fSpacingSet;
1799 delete fLSet;
1800 delete fVSet;
1801 delete fTSet;
1802 delete fLVSet;
1803 delete fLVTSet;
1804 delete fHangulSet;
1805 delete fAnySet;
1806 delete fZWJSet;
1807 delete fExtendedPictSet;
1808 }
1809
1810 //------------------------------------------------------------------------------------------
1811 //
1812 // class RBBIWordMonkey Word Break specific implementation
1813 // of RBBIMonkeyKind.
1814 //
1815 //------------------------------------------------------------------------------------------
1816 class RBBIWordMonkey: public RBBIMonkeyKind {
1817 public:
1818 RBBIWordMonkey();
1819 virtual ~RBBIWordMonkey();
1820 virtual UVector *charClasses();
1821 virtual void setText(const UnicodeString &s);
1822 virtual int32_t next(int32_t i);
1823 private:
1824 UVector *fSets;
1825
1826 UnicodeSet *fCRSet;
1827 UnicodeSet *fLFSet;
1828 UnicodeSet *fNewlineSet;
1829 UnicodeSet *fRegionalIndicatorSet;
1830 UnicodeSet *fKatakanaSet;
1831 UnicodeSet *fHebrew_LetterSet;
1832 UnicodeSet *fALetterSet;
1833 UnicodeSet *fSingle_QuoteSet;
1834 UnicodeSet *fDouble_QuoteSet;
1835 UnicodeSet *fMidNumLetSet;
1836 UnicodeSet *fMidLetterSet;
1837 UnicodeSet *fMidNumSet;
1838 UnicodeSet *fNumericSet;
1839 UnicodeSet *fFormatSet;
1840 UnicodeSet *fOtherSet;
1841 UnicodeSet *fExtendSet;
1842 UnicodeSet *fExtendNumLetSet;
1843 UnicodeSet *fWSegSpaceSet;
1844 UnicodeSet *fDictionarySet;
1845 UnicodeSet *fZWJSet;
1846 UnicodeSet *fExtendedPictSet;
1847
1848 const UnicodeString *fText;
1849 };
1850
1851
1852 RBBIWordMonkey::RBBIWordMonkey()
1853 {
1854 UErrorCode status = U_ZERO_ERROR;
1855
1856 fSets = new UVector(status);
1857
1858 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1859 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1860 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1861 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1862 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1863 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1864 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1865 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1866 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1867 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1868 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
1869 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1870 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1871 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1872 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1873 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
1874 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1875
1876 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1877 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1878
1879 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1880 fDictionarySet->addAll(*fKatakanaSet);
1881 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1882
1883 fALetterSet->removeAll(*fDictionarySet);
1884
1885 fOtherSet = new UnicodeSet();
1886 if(U_FAILURE(status)) {
1887 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1888 deferredStatus = status;
1889 return;
1890 }
1891
1892 fOtherSet->complement();
1893 fOtherSet->removeAll(*fCRSet);
1894 fOtherSet->removeAll(*fLFSet);
1895 fOtherSet->removeAll(*fNewlineSet);
1896 fOtherSet->removeAll(*fKatakanaSet);
1897 fOtherSet->removeAll(*fHebrew_LetterSet);
1898 fOtherSet->removeAll(*fALetterSet);
1899 fOtherSet->removeAll(*fSingle_QuoteSet);
1900 fOtherSet->removeAll(*fDouble_QuoteSet);
1901 fOtherSet->removeAll(*fMidLetterSet);
1902 fOtherSet->removeAll(*fMidNumSet);
1903 fOtherSet->removeAll(*fNumericSet);
1904 fOtherSet->removeAll(*fExtendNumLetSet);
1905 fOtherSet->removeAll(*fWSegSpaceSet);
1906 fOtherSet->removeAll(*fFormatSet);
1907 fOtherSet->removeAll(*fExtendSet);
1908 fOtherSet->removeAll(*fRegionalIndicatorSet);
1909 fOtherSet->removeAll(*fZWJSet);
1910 fOtherSet->removeAll(*fExtendedPictSet);
1911
1912 // Inhibit dictionary characters from being tested at all.
1913 fOtherSet->removeAll(*fDictionarySet);
1914
1915 fSets->addElement(fCRSet, status);
1916 fSets->addElement(fLFSet, status);
1917 fSets->addElement(fNewlineSet, status);
1918 fSets->addElement(fRegionalIndicatorSet, status);
1919 fSets->addElement(fHebrew_LetterSet, status);
1920 fSets->addElement(fALetterSet, status);
1921 fSets->addElement(fSingle_QuoteSet, status);
1922 fSets->addElement(fDouble_QuoteSet, status);
1923 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
1924 // from the test data. They are all in the dictionary set,
1925 // which this (old, to be retired) monkey test cannot handle.
1926 fSets->addElement(fMidLetterSet, status);
1927 fSets->addElement(fMidNumLetSet, status);
1928 fSets->addElement(fMidNumSet, status);
1929 fSets->addElement(fNumericSet, status);
1930 fSets->addElement(fFormatSet, status);
1931 fSets->addElement(fExtendSet, status);
1932 fSets->addElement(fOtherSet, status);
1933 fSets->addElement(fExtendNumLetSet, status);
1934 fSets->addElement(fWSegSpaceSet, status);
1935
1936 fSets->addElement(fZWJSet, status);
1937 fSets->addElement(fExtendedPictSet, status);
1938
1939 if (U_FAILURE(status)) {
1940 deferredStatus = status;
1941 }
1942 }
1943
1944 void RBBIWordMonkey::setText(const UnicodeString &s) {
1945 fText = &s;
1946 }
1947
1948
1949 int32_t RBBIWordMonkey::next(int32_t prevPos) {
1950 int p0, p1, p2, p3; // Indices of the significant code points around the
1951 // break position being tested. The candidate break
1952 // location is before p2.
1953
1954 int breakPos = -1;
1955
1956 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1957
1958 if (U_FAILURE(deferredStatus)) {
1959 return -1;
1960 }
1961
1962 // Prev break at end of string. return DONE.
1963 if (prevPos >= fText->length()) {
1964 return -1;
1965 }
1966 p0 = p1 = p2 = p3 = prevPos;
1967 c3 = fText->char32At(prevPos);
1968 c0 = c1 = c2 = 0;
1969 (void)p0; // Suppress set but not used warning.
1970
1971 // Loop runs once per "significant" character position in the input text.
1972 for (;;) {
1973 // Move all of the positions forward in the input string.
1974 p0 = p1; c0 = c1;
1975 p1 = p2; c1 = c2;
1976 p2 = p3; c2 = c3;
1977
1978 // Advancd p3 by X(Extend | Format)* Rule 4
1979 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
1980 do {
1981 p3 = fText->moveIndex32(p3, 1);
1982 c3 = fText->char32At(p3);
1983 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
1984 break;
1985 };
1986 }
1987 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
1988
1989
1990 if (p1 == p2) {
1991 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1992 continue;
1993 }
1994 if (p2 == fText->length()) {
1995 // Reached end of string. Always a break position.
1996 break;
1997 }
1998
1999 // Rule (3) CR x LF
2000 // No Extend or Format characters may appear between the CR and LF,
2001 // which requires the additional check for p2 immediately following p1.
2002 //
2003 if (c1==0x0D && c2==0x0A) {
2004 continue;
2005 }
2006
2007 // Rule (3a) Break before and after newlines (including CR and LF)
2008 //
2009 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2010 break;
2011 };
2012 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2013 break;
2014 };
2015
2016 // Rule (3c) ZWJ x Extended_Pictographic
2017 // Not ignoring extend chars, so peek into input text to
2018 // get the potential ZWJ, the character immediately preceding c2.
2019 // Sloppy UChar32 indexing: p2-1 may reference trail half
2020 // but char32At will get the full code point.
2021 if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
2022 continue;
2023 }
2024
2025 // Rule (3d) Keep horizontal whitespace together.
2026 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2027 continue;
2028 }
2029
2030 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2031 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2032 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2033 continue;
2034 }
2035
2036 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2037 //
2038 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2039 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2040 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2041 continue;
2042 }
2043
2044 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2045 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2046 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2047 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2048 continue;
2049 }
2050
2051 // Rule (7a) Hebrew_Letter x Single_Quote
2052 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2053 continue;
2054 }
2055
2056 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2057 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2058 continue;
2059 }
2060
2061 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2062 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2063 continue;
2064 }
2065
2066 // Rule (8) Numeric x Numeric
2067 if (fNumericSet->contains(c1) &&
2068 fNumericSet->contains(c2)) {
2069 continue;
2070 }
2071
2072 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2073 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2074 fNumericSet->contains(c2)) {
2075 continue;
2076 }
2077
2078 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2079 if (fNumericSet->contains(c1) &&
2080 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2081 continue;
2082 }
2083
2084 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2085 if (fNumericSet->contains(c0) &&
2086 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2087 fNumericSet->contains(c2)) {
2088 continue;
2089 }
2090
2091 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2092 if (fNumericSet->contains(c1) &&
2093 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2094 fNumericSet->contains(c3)) {
2095 continue;
2096 }
2097
2098 // Rule (13) Katakana x Katakana
2099 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2100 // all Katakana are handled by the dictionary breaker.
2101 if (fKatakanaSet->contains(c1) &&
2102 fKatakanaSet->contains(c2)) {
2103 continue;
2104 }
2105
2106 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2107 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2108 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2109 fExtendNumLetSet->contains(c2)) {
2110 continue;
2111 }
2112
2113 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2114 if (fExtendNumLetSet->contains(c1) &&
2115 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2116 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2117 continue;
2118 }
2119
2120 // Rule 15 - 17 Group pairs of Regional Indicators.
2121 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2122 break;
2123 }
2124 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2125 continue;
2126 }
2127
2128 // Rule 999. Break found here.
2129 break;
2130 }
2131
2132 breakPos = p2;
2133 return breakPos;
2134 }
2135
2136
2137 UVector *RBBIWordMonkey::charClasses() {
2138 return fSets;
2139 }
2140
2141
2142 RBBIWordMonkey::~RBBIWordMonkey() {
2143 delete fSets;
2144 delete fCRSet;
2145 delete fLFSet;
2146 delete fNewlineSet;
2147 delete fKatakanaSet;
2148 delete fHebrew_LetterSet;
2149 delete fALetterSet;
2150 delete fSingle_QuoteSet;
2151 delete fDouble_QuoteSet;
2152 delete fMidNumLetSet;
2153 delete fMidLetterSet;
2154 delete fMidNumSet;
2155 delete fNumericSet;
2156 delete fFormatSet;
2157 delete fExtendSet;
2158 delete fExtendNumLetSet;
2159 delete fWSegSpaceSet;
2160 delete fRegionalIndicatorSet;
2161 delete fDictionarySet;
2162 delete fOtherSet;
2163 delete fZWJSet;
2164 delete fExtendedPictSet;
2165 }
2166
2167
2168
2169
2170 //------------------------------------------------------------------------------------------
2171 //
2172 // class RBBISentMonkey Sentence Break specific implementation
2173 // of RBBIMonkeyKind.
2174 //
2175 //------------------------------------------------------------------------------------------
2176 class RBBISentMonkey: public RBBIMonkeyKind {
2177 public:
2178 RBBISentMonkey();
2179 virtual ~RBBISentMonkey();
2180 virtual UVector *charClasses();
2181 virtual void setText(const UnicodeString &s);
2182 virtual int32_t next(int32_t i);
2183 private:
2184 int moveBack(int posFrom);
2185 int moveForward(int posFrom);
2186 UChar32 cAt(int pos);
2187
2188 UVector *fSets;
2189
2190 UnicodeSet *fSepSet;
2191 UnicodeSet *fFormatSet;
2192 UnicodeSet *fSpSet;
2193 UnicodeSet *fLowerSet;
2194 UnicodeSet *fUpperSet;
2195 UnicodeSet *fOLetterSet;
2196 UnicodeSet *fNumericSet;
2197 UnicodeSet *fATermSet;
2198 UnicodeSet *fSContinueSet;
2199 UnicodeSet *fSTermSet;
2200 UnicodeSet *fCloseSet;
2201 UnicodeSet *fOtherSet;
2202 UnicodeSet *fExtendSet;
2203
2204 const UnicodeString *fText;
2205
2206 };
2207
2208 RBBISentMonkey::RBBISentMonkey()
2209 {
2210 UErrorCode status = U_ZERO_ERROR;
2211
2212 fSets = new UVector(status);
2213
2214 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2215 // set and made into character classes of their own. For the monkey impl,
2216 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2217 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2218 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2219 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2220 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2221 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2222 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2223 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2224 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2225 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2226 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2227 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2228 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2229 fOtherSet = new UnicodeSet();
2230
2231 if(U_FAILURE(status)) {
2232 deferredStatus = status;
2233 return;
2234 }
2235
2236 fOtherSet->complement();
2237 fOtherSet->removeAll(*fSepSet);
2238 fOtherSet->removeAll(*fFormatSet);
2239 fOtherSet->removeAll(*fSpSet);
2240 fOtherSet->removeAll(*fLowerSet);
2241 fOtherSet->removeAll(*fUpperSet);
2242 fOtherSet->removeAll(*fOLetterSet);
2243 fOtherSet->removeAll(*fNumericSet);
2244 fOtherSet->removeAll(*fATermSet);
2245 fOtherSet->removeAll(*fSContinueSet);
2246 fOtherSet->removeAll(*fSTermSet);
2247 fOtherSet->removeAll(*fCloseSet);
2248 fOtherSet->removeAll(*fExtendSet);
2249
2250 fSets->addElement(fSepSet, status);
2251 fSets->addElement(fFormatSet, status);
2252 fSets->addElement(fSpSet, status);
2253 fSets->addElement(fLowerSet, status);
2254 fSets->addElement(fUpperSet, status);
2255 fSets->addElement(fOLetterSet, status);
2256 fSets->addElement(fNumericSet, status);
2257 fSets->addElement(fATermSet, status);
2258 fSets->addElement(fSContinueSet, status);
2259 fSets->addElement(fSTermSet, status);
2260 fSets->addElement(fCloseSet, status);
2261 fSets->addElement(fOtherSet, status);
2262 fSets->addElement(fExtendSet, status);
2263
2264 if (U_FAILURE(status)) {
2265 deferredStatus = status;
2266 }
2267 }
2268
2269
2270
2271 void RBBISentMonkey::setText(const UnicodeString &s) {
2272 fText = &s;
2273 }
2274
2275 UVector *RBBISentMonkey::charClasses() {
2276 return fSets;
2277 }
2278
2279
2280 // moveBack() Find the "significant" code point preceding the index i.
2281 // Skips over ($Extend | $Format)* .
2282 //
2283 int RBBISentMonkey::moveBack(int i) {
2284 if (i <= 0) {
2285 return -1;
2286 }
2287 UChar32 c;
2288 int32_t j = i;
2289 do {
2290 j = fText->moveIndex32(j, -1);
2291 c = fText->char32At(j);
2292 }
2293 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2294 return j;
2295
2296 }
2297
2298
2299 int RBBISentMonkey::moveForward(int i) {
2300 if (i>=fText->length()) {
2301 return fText->length();
2302 }
2303 UChar32 c;
2304 int32_t j = i;
2305 do {
2306 j = fText->moveIndex32(j, 1);
2307 c = cAt(j);
2308 }
2309 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2310 return j;
2311 }
2312
2313 UChar32 RBBISentMonkey::cAt(int pos) {
2314 if (pos<0 || pos>=fText->length()) {
2315 return -1;
2316 } else {
2317 return fText->char32At(pos);
2318 }
2319 }
2320
2321 int32_t RBBISentMonkey::next(int32_t prevPos) {
2322 int p0, p1, p2, p3; // Indices of the significant code points around the
2323 // break position being tested. The candidate break
2324 // location is before p2.
2325
2326 int breakPos = -1;
2327
2328 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2329 UChar32 c;
2330
2331 if (U_FAILURE(deferredStatus)) {
2332 return -1;
2333 }
2334
2335 // Prev break at end of string. return DONE.
2336 if (prevPos >= fText->length()) {
2337 return -1;
2338 }
2339 p0 = p1 = p2 = p3 = prevPos;
2340 c3 = fText->char32At(prevPos);
2341 c0 = c1 = c2 = 0;
2342 (void)p0; // Suppress set but not used warning.
2343
2344 // Loop runs once per "significant" character position in the input text.
2345 for (;;) {
2346 // Move all of the positions forward in the input string.
2347 p0 = p1; c0 = c1;
2348 p1 = p2; c1 = c2;
2349 p2 = p3; c2 = c3;
2350
2351 // Advancd p3 by X(Extend | Format)* Rule 4
2352 p3 = moveForward(p3);
2353 c3 = cAt(p3);
2354
2355 // Rule (3) CR x LF
2356 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2357 continue;
2358 }
2359
2360 // Rule (4). Sep <break>
2361 if (fSepSet->contains(c1)) {
2362 p2 = p1+1; // Separators don't combine with Extend or Format.
2363 break;
2364 }
2365
2366 if (p2 >= fText->length()) {
2367 // Reached end of string. Always a break position.
2368 break;
2369 }
2370
2371 if (p2 == prevPos) {
2372 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2373 continue;
2374 }
2375
2376 // Rule (6). ATerm x Numeric
2377 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2378 continue;
2379 }
2380
2381 // Rule (7). (Upper | Lower) ATerm x Uppper
2382 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2383 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2384 continue;
2385 }
2386
2387 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2388 // Note: STerm | ATerm are added to the negated part of the expression by a
2389 // note to the Unicode 5.0 documents.
2390 int p8 = p1;
2391 while (fSpSet->contains(cAt(p8))) {
2392 p8 = moveBack(p8);
2393 }
2394 while (fCloseSet->contains(cAt(p8))) {
2395 p8 = moveBack(p8);
2396 }
2397 if (fATermSet->contains(cAt(p8))) {
2398 p8=p2;
2399 for (;;) {
2400 c = cAt(p8);
2401 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2402 fLowerSet->contains(c) || fSepSet->contains(c) ||
2403 fATermSet->contains(c) || fSTermSet->contains(c)) {
2404 break;
2405 }
2406 p8 = moveForward(p8);
2407 }
2408 if (fLowerSet->contains(cAt(p8))) {
2409 continue;
2410 }
2411 }
2412
2413 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2414 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2415 p8 = p1;
2416 while (fSpSet->contains(cAt(p8))) {
2417 p8 = moveBack(p8);
2418 }
2419 while (fCloseSet->contains(cAt(p8))) {
2420 p8 = moveBack(p8);
2421 }
2422 c = cAt(p8);
2423 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2424 continue;
2425 }
2426 }
2427
2428 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2429 int p9 = p1;
2430 while (fCloseSet->contains(cAt(p9))) {
2431 p9 = moveBack(p9);
2432 }
2433 c = cAt(p9);
2434 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2435 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2436 continue;
2437 }
2438 }
2439
2440 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2441 int p10 = p1;
2442 while (fSpSet->contains(cAt(p10))) {
2443 p10 = moveBack(p10);
2444 }
2445 while (fCloseSet->contains(cAt(p10))) {
2446 p10 = moveBack(p10);
2447 }
2448 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2449 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2450 continue;
2451 }
2452 }
2453
2454 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2455 int p11 = p1;
2456 if (fSepSet->contains(cAt(p11))) {
2457 p11 = moveBack(p11);
2458 }
2459 while (fSpSet->contains(cAt(p11))) {
2460 p11 = moveBack(p11);
2461 }
2462 while (fCloseSet->contains(cAt(p11))) {
2463 p11 = moveBack(p11);
2464 }
2465 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2466 break;
2467 }
2468
2469 // Rule (12) Any x Any
2470 continue;
2471 }
2472 breakPos = p2;
2473 return breakPos;
2474 }
2475
2476 RBBISentMonkey::~RBBISentMonkey() {
2477 delete fSets;
2478 delete fSepSet;
2479 delete fFormatSet;
2480 delete fSpSet;
2481 delete fLowerSet;
2482 delete fUpperSet;
2483 delete fOLetterSet;
2484 delete fNumericSet;
2485 delete fATermSet;
2486 delete fSContinueSet;
2487 delete fSTermSet;
2488 delete fCloseSet;
2489 delete fOtherSet;
2490 delete fExtendSet;
2491 }
2492
2493
2494
2495 //-------------------------------------------------------------------------------------------
2496 //
2497 // RBBILineMonkey
2498 //
2499 //-------------------------------------------------------------------------------------------
2500
2501 class RBBILineMonkey: public RBBIMonkeyKind {
2502 public:
2503 RBBILineMonkey();
2504 virtual ~RBBILineMonkey();
2505 virtual UVector *charClasses();
2506 virtual void setText(const UnicodeString &s);
2507 virtual int32_t next(int32_t i);
2508 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2509 private:
2510 UVector *fSets;
2511
2512 UnicodeSet *fBK;
2513 UnicodeSet *fCR;
2514 UnicodeSet *fLF;
2515 UnicodeSet *fCM;
2516 UnicodeSet *fNL;
2517 UnicodeSet *fSG;
2518 UnicodeSet *fWJ;
2519 UnicodeSet *fZW;
2520 UnicodeSet *fGL;
2521 UnicodeSet *fCB;
2522 UnicodeSet *fSP;
2523 UnicodeSet *fB2;
2524 UnicodeSet *fBA;
2525 UnicodeSet *fBB;
2526 UnicodeSet *fHY;
2527 UnicodeSet *fH2;
2528 UnicodeSet *fH3;
2529 UnicodeSet *fCL;
2530 UnicodeSet *fCP;
2531 UnicodeSet *fEX;
2532 UnicodeSet *fIN;
2533 UnicodeSet *fJL;
2534 UnicodeSet *fJV;
2535 UnicodeSet *fJT;
2536 UnicodeSet *fNS;
2537 UnicodeSet *fOP;
2538 UnicodeSet *fQU;
2539 UnicodeSet *fIS;
2540 UnicodeSet *fNU;
2541 UnicodeSet *fPO;
2542 UnicodeSet *fPR;
2543 UnicodeSet *fSY;
2544 UnicodeSet *fAI;
2545 UnicodeSet *fAL;
2546 UnicodeSet *fCJ;
2547 UnicodeSet *fHL;
2548 UnicodeSet *fID;
2549 UnicodeSet *fRI;
2550 UnicodeSet *fXX;
2551 UnicodeSet *fEB;
2552 UnicodeSet *fEM;
2553 UnicodeSet *fZJ;
2554
2555 BreakIterator *fCharBI;
2556 const UnicodeString *fText;
2557 RegexMatcher *fNumberMatcher;
2558 };
2559
2560 RBBILineMonkey::RBBILineMonkey() :
2561 RBBIMonkeyKind(),
2562 fSets(NULL),
2563
2564 fCharBI(NULL),
2565 fText(NULL),
2566 fNumberMatcher(NULL)
2567
2568 {
2569 if (U_FAILURE(deferredStatus)) {
2570 return;
2571 }
2572
2573 UErrorCode status = U_ZERO_ERROR;
2574
2575 fSets = new UVector(status);
2576
2577 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2578 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2579 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2580 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2581 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2582 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2583 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2584 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2585 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2586 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2587 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2588 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2589 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2590 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2591 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2592 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2593 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2594 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2595 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2596 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2597 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2598 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2599 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2600 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2601 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2602 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2603 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2604 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2605 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2606 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2607 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2608 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2609 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2610 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2611 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2612 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2613 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2614 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2615 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2616 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2617 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2618 fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2619
2620 if (U_FAILURE(status)) {
2621 deferredStatus = status;
2622 return;
2623 }
2624
2625 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2626 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2627 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2628
2629 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2630 fCM->addAll(*fZJ); // ZWJ behaves as a CM.
2631
2632 fSets->addElement(fBK, status);
2633 fSets->addElement(fCR, status);
2634 fSets->addElement(fLF, status);
2635 fSets->addElement(fCM, status);
2636 fSets->addElement(fNL, status);
2637 fSets->addElement(fWJ, status);
2638 fSets->addElement(fZW, status);
2639 fSets->addElement(fGL, status);
2640 fSets->addElement(fCB, status);
2641 fSets->addElement(fSP, status);
2642 fSets->addElement(fB2, status);
2643 fSets->addElement(fBA, status);
2644 fSets->addElement(fBB, status);
2645 fSets->addElement(fHY, status);
2646 fSets->addElement(fH2, status);
2647 fSets->addElement(fH3, status);
2648 fSets->addElement(fCL, status);
2649 fSets->addElement(fCP, status);
2650 fSets->addElement(fEX, status);
2651 fSets->addElement(fIN, status);
2652 fSets->addElement(fJL, status);
2653 fSets->addElement(fJT, status);
2654 fSets->addElement(fJV, status);
2655 fSets->addElement(fNS, status);
2656 fSets->addElement(fOP, status);
2657 fSets->addElement(fQU, status);
2658 fSets->addElement(fIS, status);
2659 fSets->addElement(fNU, status);
2660 fSets->addElement(fPO, status);
2661 fSets->addElement(fPR, status);
2662 fSets->addElement(fSY, status);
2663 fSets->addElement(fAI, status);
2664 fSets->addElement(fAL, status);
2665 fSets->addElement(fHL, status);
2666 fSets->addElement(fID, status);
2667 fSets->addElement(fWJ, status);
2668 fSets->addElement(fRI, status);
2669 fSets->addElement(fSG, status);
2670 fSets->addElement(fEB, status);
2671 fSets->addElement(fEM, status);
2672 fSets->addElement(fZJ, status);
2673
2674
2675 const char *rules =
2676 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2677 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2678 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2679 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2680 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2681 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2682
2683 fNumberMatcher = new RegexMatcher(
2684 UnicodeString(rules, -1, US_INV), 0, status);
2685
2686 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2687
2688 if (U_FAILURE(status)) {
2689 deferredStatus = status;
2690 }
2691 }
2692
2693
2694 void RBBILineMonkey::setText(const UnicodeString &s) {
2695 fText = &s;
2696 fCharBI->setText(s);
2697 fNumberMatcher->reset(s);
2698 }
2699
2700 //
2701 // rule9Adjust
2702 // Line Break TR rules 9 and 10 implementation.
2703 // This deals with combining marks and other sequences that
2704 // that must be treated as if they were something other than what they actually are.
2705 //
2706 // This is factored out into a separate function because it must be applied twice for
2707 // each potential break, once to the chars before the position being checked, then
2708 // again to the text following the possible break.
2709 //
2710 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2711 if (pos == -1) {
2712 // Invalid initial position. Happens during the warmup iteration of the
2713 // main loop in next().
2714 return;
2715 }
2716
2717 int32_t nPos = *nextPos;
2718
2719 // LB 9 Keep combining sequences together.
2720 // advance over any CM class chars. Note that Line Break CM is different
2721 // from the normal Grapheme Extend property.
2722 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2723 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2724 for (;;) {
2725 *nextChar = fText->char32At(nPos);
2726 if (!fCM->contains(*nextChar)) {
2727 break;
2728 }
2729 nPos = fText->moveIndex32(nPos, 1);
2730 }
2731 }
2732
2733
2734 // LB 9 Treat X CM* as if it were x.
2735 // No explicit action required.
2736
2737 // LB 10 Treat any remaining combining mark as AL
2738 if (fCM->contains(*posChar)) {
2739 *posChar = u'A';
2740 }
2741
2742 // Push the updated nextPos and nextChar back to our caller.
2743 // This only makes a difference if posChar got bigger by consuming a
2744 // combining sequence.
2745 *nextPos = nPos;
2746 *nextChar = fText->char32At(nPos);
2747 }
2748
2749
2750
2751 int32_t RBBILineMonkey::next(int32_t startPos) {
2752 UErrorCode status = U_ZERO_ERROR;
2753 int32_t pos; // Index of the char following a potential break position
2754 UChar32 thisChar; // Character at above position "pos"
2755
2756 int32_t prevPos; // Index of the char preceding a potential break position
2757 UChar32 prevChar; // Character at above position. Note that prevChar
2758 // and thisChar may not be adjacent because combining
2759 // characters between them will be ignored.
2760
2761 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2762 UChar32 prevCharX2;
2763
2764 int32_t nextPos; // Index of the next character following pos.
2765 // Usually skips over combining marks.
2766 int32_t nextCPPos; // Index of the code point following "pos."
2767 // May point to a combining mark.
2768 int32_t tPos; // temp value.
2769 UChar32 c;
2770
2771 if (U_FAILURE(deferredStatus)) {
2772 return -1;
2773 }
2774
2775 if (startPos >= fText->length()) {
2776 return -1;
2777 }
2778
2779
2780 // Initial values for loop. Loop will run the first time without finding breaks,
2781 // while the invalid values shift out and the "this" and
2782 // "prev" positions are filled in with good values.
2783 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2784 thisChar = prevChar = prevCharX2 = 0;
2785 nextPos = nextCPPos = startPos;
2786
2787
2788 // Loop runs once per position in the test text, until a break position
2789 // is found.
2790 for (;;) {
2791 prevPosX2 = prevPos;
2792 prevCharX2 = prevChar;
2793
2794 prevPos = pos;
2795 prevChar = thisChar;
2796
2797 pos = nextPos;
2798 thisChar = fText->char32At(pos);
2799
2800 nextCPPos = fText->moveIndex32(pos, 1);
2801 nextPos = nextCPPos;
2802
2803 // Rule LB2 - Break at end of text.
2804 if (pos >= fText->length()) {
2805 break;
2806 }
2807
2808 // Rule LB 9 - adjust for combining sequences.
2809 // We do this one out-of-order because the adjustment does not change anything
2810 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2811 // be applied.
2812 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2813 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2814 c = fText->char32At(nextPos);
2815 rule9Adjust(pos, &thisChar, &nextPos, &c);
2816
2817 // If the loop is still warming up - if we haven't shifted the initial
2818 // -1 positions out of prevPos yet - loop back to advance the
2819 // position in the input without any further looking for breaks.
2820 if (prevPos == -1) {
2821 continue;
2822 }
2823
2824 // LB 4 Always break after hard line breaks,
2825 if (fBK->contains(prevChar)) {
2826 break;
2827 }
2828
2829 // LB 5 Break after CR, LF, NL, but not inside CR LF
2830 if (prevChar == 0x0d && thisChar == 0x0a) {
2831 continue;
2832 }
2833 if (prevChar == 0x0d ||
2834 prevChar == 0x0a ||
2835 prevChar == 0x85) {
2836 break;
2837 }
2838
2839 // LB 6 Don't break before hard line breaks
2840 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2841 fBK->contains(thisChar)) {
2842 continue;
2843 }
2844
2845
2846 // LB 7 Don't break before spaces or zero-width space.
2847 if (fSP->contains(thisChar)) {
2848 continue;
2849 }
2850
2851 if (fZW->contains(thisChar)) {
2852 continue;
2853 }
2854
2855 // LB 8 Break after zero width space
2856 if (fZW->contains(prevChar)) {
2857 break;
2858 }
2859
2860 // LB 25 Numbers
2861 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2862 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2863 if (fNumberMatcher->lookingAt(prevPos, status)) {
2864 if (U_FAILURE(status)) {
2865 break;
2866 }
2867 // Matched a number. But could have been just a single digit, which would
2868 // not represent a "no break here" between prevChar and thisChar
2869 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
2870 if (numEndIdx > pos) {
2871 // Number match includes at least our two chars being checked
2872 if (numEndIdx > nextPos) {
2873 // Number match includes additional chars. Update pos and nextPos
2874 // so that next loop iteration will continue at the end of the number,
2875 // checking for breaks between last char in number & whatever follows.
2876 pos = nextPos = numEndIdx;
2877 do {
2878 pos = fText->moveIndex32(pos, -1);
2879 thisChar = fText->char32At(pos);
2880 } while (fCM->contains(thisChar));
2881 }
2882 continue;
2883 }
2884 }
2885
2886 // LB 8a ZWJ x
2887 // The monkey test's way of ignoring combining characters doesn't work
2888 // for this rule. ZJ is also a CM. Need to get the actual character
2889 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2890 {
2891 int32_t prevIdx = fText->moveIndex32(pos, -1);
2892 UChar32 prevC = fText->char32At(prevIdx);
2893 if (fZJ->contains(prevC)) {
2894 continue;
2895 }
2896 }
2897
2898 // LB 9, 10 Already done, at top of loop.
2899 //
2900
2901
2902 // LB 11 Do not break before or after WORD JOINER and related characters.
2903 // x WJ
2904 // WJ x
2905 //
2906 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2907 continue;
2908 }
2909
2910 // LB 12
2911 // GL x
2912 if (fGL->contains(prevChar)) {
2913 continue;
2914 }
2915
2916 // LB 12a
2917 // [^SP BA HY] x GL
2918 if (!(fSP->contains(prevChar) ||
2919 fBA->contains(prevChar) ||
2920 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
2921 continue;
2922 }
2923
2924
2925
2926 // LB 13 Don't break before closings.
2927 // NU x CL, NU x CP and NU x IS are not matched here so that they will
2928 // fall into LB 17 and the more general number regular expression.
2929 //
2930 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
2931 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
2932 fEX->contains(thisChar) ||
2933 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
2934 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
2935 continue;
2936 }
2937
2938 // LB 14 Don't break after OP SP*
2939 // Scan backwards, checking for this sequence.
2940 // The OP char could include combining marks, so we actually check for
2941 // OP CM* SP*
2942 // Another Twist: The Rule 67 fixes may have changed a SP CM
2943 // sequence into a ID char, so before scanning back through spaces,
2944 // verify that prevChar is indeed a space. The prevChar variable
2945 // may differ from fText[prevPos]
2946 tPos = prevPos;
2947 if (fSP->contains(prevChar)) {
2948 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2949 tPos=fText->moveIndex32(tPos, -1);
2950 }
2951 }
2952 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2953 tPos=fText->moveIndex32(tPos, -1);
2954 }
2955 if (fOP->contains(fText->char32At(tPos))) {
2956 continue;
2957 }
2958
2959
2960 // LB 15 QU SP* x OP
2961 if (fOP->contains(thisChar)) {
2962 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
2963 int tPos = prevPos;
2964 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2965 tPos = fText->moveIndex32(tPos, -1);
2966 }
2967 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
2968 tPos = fText->moveIndex32(tPos, -1);
2969 }
2970 if (fQU->contains(fText->char32At(tPos))) {
2971 continue;
2972 }
2973 }
2974
2975
2976
2977 // LB 16 (CL | CP) SP* x NS
2978 // Scan backwards for SP* CM* (CL | CP)
2979 if (fNS->contains(thisChar)) {
2980 int tPos = prevPos;
2981 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2982 tPos = fText->moveIndex32(tPos, -1);
2983 }
2984 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
2985 tPos = fText->moveIndex32(tPos, -1);
2986 }
2987 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
2988 continue;
2989 }
2990 }
2991
2992
2993 // LB 17 B2 SP* x B2
2994 if (fB2->contains(thisChar)) {
2995 // Scan backwards, checking for the B2 CM* SP* sequence.
2996 tPos = prevPos;
2997 if (fSP->contains(prevChar)) {
2998 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2999 tPos=fText->moveIndex32(tPos, -1);
3000 }
3001 }
3002 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3003 tPos=fText->moveIndex32(tPos, -1);
3004 }
3005 if (fB2->contains(fText->char32At(tPos))) {
3006 continue;
3007 }
3008 }
3009
3010
3011 // LB 18 break after space
3012 if (fSP->contains(prevChar)) {
3013 break;
3014 }
3015
3016 // LB 19
3017 // x QU
3018 // QU x
3019 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3020 continue;
3021 }
3022
3023 // LB 20 Break around a CB
3024 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3025 break;
3026 }
3027
3028 // LB 21
3029 if (fBA->contains(thisChar) ||
3030 fHY->contains(thisChar) ||
3031 fNS->contains(thisChar) ||
3032 fBB->contains(prevChar) ) {
3033 continue;
3034 }
3035
3036 // LB 21a
3037 // HL (HY | BA) x
3038 if (fHL->contains(prevCharX2) &&
3039 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3040 continue;
3041 }
3042
3043 // LB 21b
3044 // SY x HL
3045 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3046 continue;
3047 }
3048
3049 // LB 22
3050 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3051 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3052 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3053 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3054 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3055 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3056 continue;
3057 }
3058
3059
3060 // LB 23 (AL | HL) x NU
3061 // NU x (AL | HL)
3062 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3063 continue;
3064 }
3065 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3066 continue;
3067 }
3068
3069 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3070 // PR x (ID | EB | EM)
3071 // (ID | EB | EM) x PO
3072 if (fPR->contains(prevChar) &&
3073 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3074 continue;
3075 }
3076 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3077 fPO->contains(thisChar)) {
3078 continue;
3079 }
3080
3081 // LB 24 Do not break between prefix and letters or ideographs.
3082 // (PR | PO) x (AL | HL)
3083 // (AL | HL) x (PR | PO)
3084 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3085 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3086 continue;
3087 }
3088 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3089 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3090 continue;
3091 }
3092
3093 // LB 25 numbers match, moved up, before LB 8a,
3094
3095 // LB 26 Do not break a Korean syllable.
3096 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3097 fJV->contains(thisChar) ||
3098 fH2->contains(thisChar) ||
3099 fH3->contains(thisChar))) {
3100 continue;
3101 }
3102
3103 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3104 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3105 continue;
3106 }
3107
3108 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3109 fJT->contains(thisChar)) {
3110 continue;
3111 }
3112
3113 // LB 27 Treat a Korean Syllable Block the same as ID.
3114 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3115 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3116 fIN->contains(thisChar)) {
3117 continue;
3118 }
3119 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3120 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3121 fPO->contains(thisChar)) {
3122 continue;
3123 }
3124 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3125 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3126 continue;
3127 }
3128
3129
3130
3131 // LB 28 Do not break between alphabetics ("at").
3132 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3133 continue;
3134 }
3135
3136 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3137 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3138 continue;
3139 }
3140
3141 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3142 // (AL | NU) x OP
3143 // CP x (AL | NU)
3144 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3145 continue;
3146 }
3147 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3148 continue;
3149 }
3150
3151 // LB30a RI RI <break> RI
3152 // RI x RI
3153 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3154 break;
3155 }
3156 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3157 continue;
3158 }
3159
3160 // LB30b Emoji Base x Emoji Modifier
3161 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3162 continue;
3163 }
3164
3165 // LB 31 Break everywhere else
3166 break;
3167
3168 }
3169
3170 return pos;
3171 }
3172
3173
3174 UVector *RBBILineMonkey::charClasses() {
3175 return fSets;
3176 }
3177
3178
3179 RBBILineMonkey::~RBBILineMonkey() {
3180 delete fSets;
3181
3182 delete fBK;
3183 delete fCR;
3184 delete fLF;
3185 delete fCM;
3186 delete fNL;
3187 delete fWJ;
3188 delete fZW;
3189 delete fGL;
3190 delete fCB;
3191 delete fSP;
3192 delete fB2;
3193 delete fBA;
3194 delete fBB;
3195 delete fHY;
3196 delete fH2;
3197 delete fH3;
3198 delete fCL;
3199 delete fCP;
3200 delete fEX;
3201 delete fIN;
3202 delete fJL;
3203 delete fJV;
3204 delete fJT;
3205 delete fNS;
3206 delete fOP;
3207 delete fQU;
3208 delete fIS;
3209 delete fNU;
3210 delete fPO;
3211 delete fPR;
3212 delete fSY;
3213 delete fAI;
3214 delete fAL;
3215 delete fCJ;
3216 delete fHL;
3217 delete fID;
3218 delete fRI;
3219 delete fSG;
3220 delete fXX;
3221 delete fEB;
3222 delete fEM;
3223 delete fZJ;
3224
3225 delete fCharBI;
3226 delete fNumberMatcher;
3227 }
3228
3229
3230 //-------------------------------------------------------------------------------------------
3231 //
3232 // TestMonkey
3233 //
3234 // params
3235 // seed=nnnnn Random number starting seed.
3236 // Setting the seed allows errors to be reproduced.
3237 // loop=nnn Looping count. Controls running time.
3238 // -1: run forever.
3239 // 0 or greater: run length.
3240 //
3241 // type = char | word | line | sent | title
3242 //
3243 // Example:
3244 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3245 //
3246 //-------------------------------------------------------------------------------------------
3247
3248 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3249 int32_t val = defaultVal;
3250 name.append(" *= *(-?\\d+)");
3251 UErrorCode status = U_ZERO_ERROR;
3252 RegexMatcher m(name, params, 0, status);
3253 if (m.find()) {
3254 // The param exists. Convert the string to an int.
3255 char valString[100];
3256 int32_t paramLength = m.end(1, status) - m.start(1, status);
3257 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3258 paramLength = (int32_t)(sizeof(valString)-2);
3259 }
3260 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3261 val = strtol(valString, NULL, 10);
3262
3263 // Delete this parameter from the params string.
3264 m.reset();
3265 params = m.replaceFirst("", status);
3266 }
3267 U_ASSERT(U_SUCCESS(status));
3268 return val;
3269 }
3270 #endif
3271
3272 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3273 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3274 BreakIterator *bi,
3275 int expected[],
3276 int expectedcount)
3277 {
3278 int count = 0;
3279 int i = 0;
3280 int forward[50];
3281 bi->setText(ustr);
3282 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3283 forward[count] = i;
3284 if (count < expectedcount && expected[count] != i) {
3285 test->errln("%s:%d break forward test failed: expected %d but got %d",
3286 __FILE__, __LINE__, expected[count], i);
3287 break;
3288 }
3289 count ++;
3290 }
3291 if (count != expectedcount) {
3292 printStringBreaks(ustr, expected, expectedcount);
3293 test->errln("%s:%d break forward test failed: missed %d match",
3294 __FILE__, __LINE__, expectedcount - count);
3295 return;
3296 }
3297 // testing boundaries
3298 for (i = 1; i < expectedcount; i ++) {
3299 int j = expected[i - 1];
3300 if (!bi->isBoundary(j)) {
3301 printStringBreaks(ustr, expected, expectedcount);
3302 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3303 __FILE__, __LINE__, j);
3304 return;
3305 }
3306 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3307 if (bi->isBoundary(j)) {
3308 printStringBreaks(ustr, expected, expectedcount);
3309 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3310 __FILE__, __LINE__, j);
3311 return;
3312 }
3313 }
3314 }
3315
3316 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3317 count --;
3318 if (forward[count] != i) {
3319 printStringBreaks(ustr, expected, expectedcount);
3320 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3321 __FILE__, __LINE__, forward[count], i);
3322 break;
3323 }
3324 }
3325 if (count != 0) {
3326 printStringBreaks(ustr, expected, expectedcount);
3327 test->errln("break test previous() failed: missed a match");
3328 return;
3329 }
3330
3331 // testing preceding
3332 for (i = 0; i < expectedcount - 1; i ++) {
3333 // int j = expected[i] + 1;
3334 int j = ustr.moveIndex32(expected[i], 1);
3335 for (; j <= expected[i + 1]; j ++) {
3336 int32_t expectedPreceding = expected[i];
3337 int32_t actualPreceding = bi->preceding(j);
3338 if (actualPreceding != expectedPreceding) {
3339 printStringBreaks(ustr, expected, expectedcount);
3340 test->errln("%s:%d preceding(%d): expected %d, got %d",
3341 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3342 return;
3343 }
3344 }
3345 }
3346 }
3347 #endif
3348
3349 void RBBITest::TestWordBreaks(void)
3350 {
3351 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3352
3353 Locale locale("en");
3354 UErrorCode status = U_ZERO_ERROR;
3355 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3356 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3357 // Replaced any C+J characters in a row with a random sequence of characters
3358 // of the same length to make our C+J segmentation not get in the way.
3359 static const char *strlist[] =
3360 {
3361 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3362 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3363 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3364 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3365 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3366 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3367 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3368 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3369 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3370 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3371 "\\u2027\\U000e0067\\u0a47\\u00b7",
3372 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3373 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3374 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3375 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3376 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3377 "\\u0027\\u11af\\U000e0057\\u0602",
3378 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3379 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3380 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3381 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3382 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3383 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3384 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3385 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3386 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3387 "\\u18f4\\U000e0049\\u20e7\\u2027",
3388 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3389 "\\ua183\\u102d\\u0bec\\u003a",
3390 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3391 "\\u003a\\u0e57\\u0fad\\u002e",
3392 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3393 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3394 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3395 "\\u003a\\u0664\\u00b7\\u1fba",
3396 "\\u003b\\u0027\\u00b7\\u47a3",
3397 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3398 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3399 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3400 };
3401 int loop;
3402 if (U_FAILURE(status)) {
3403 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3404 return;
3405 }
3406 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3407 // printf("looping %d\n", loop);
3408 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3409 // RBBICharMonkey monkey;
3410 RBBIWordMonkey monkey;
3411
3412 int expected[50];
3413 int expectedcount = 0;
3414
3415 monkey.setText(ustr);
3416 int i;
3417 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3418 expected[expectedcount ++] = i;
3419 }
3420
3421 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3422 }
3423 delete bi;
3424 #endif
3425 }
3426
3427 void RBBITest::TestWordBoundary(void)
3428 {
3429 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3430 Locale locale("en");
3431 UErrorCode status = U_ZERO_ERROR;
3432 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3433 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3434 if (U_FAILURE(status)) {
3435 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3436 __FILE__, __LINE__, u_errorName(status));
3437 return;
3438 }
3439 UChar str[50];
3440 static const char *strlist[] =
3441 {
3442 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3443 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3444 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3445 "\\u2027\\U000e0067\\u0a47\\u00b7",
3446 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3447 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3448 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3449 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3450 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3451 "\\u0027\\u11af\\U000e0057\\u0602",
3452 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3453 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3454 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3455 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3456 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3457 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3458 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3459 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3460 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3461 "\\u58f4\\U000e0049\\u20e7\\u2027",
3462 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3463 "\\ua183\\u102d\\u0bec\\u003a",
3464 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3465 "\\u003a\\u0e57\\u0fad\\u002e",
3466 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3467 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3468 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3469 "\\u003a\\u0664\\u00b7\\u1fba",
3470 "\\u003b\\u0027\\u00b7\\u47a3",
3471 };
3472 int loop;
3473 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3474 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3475 UnicodeString ustr(str);
3476 int forward[50];
3477 int count = 0;
3478
3479 bi->setText(ustr);
3480 int prev = -1;
3481 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3482 ++count;
3483 if (count >= UPRV_LENGTHOF(forward)) {
3484 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3485 __FILE__, __LINE__, loop, count, boundary);
3486 return;
3487 }
3488 forward[count] = boundary;
3489 if (boundary <= prev) {
3490 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3491 __FILE__, __LINE__, loop, prev, boundary);
3492 break;
3493 }
3494 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3495 if (bi->isBoundary(nonBoundary)) {
3496 printStringBreaks(ustr, forward, count);
3497 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3498 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3499 return;
3500 }
3501 }
3502 if (!bi->isBoundary(boundary)) {
3503 printStringBreaks(ustr, forward, count);
3504 errln("%s:%d happy boundary test failed: expected %d a boundary",
3505 __FILE__, __LINE__, boundary);
3506 return;
3507 }
3508 prev = boundary;
3509 }
3510 }
3511 }
3512
3513 void RBBITest::TestLineBreaks(void)
3514 {
3515 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3516 Locale locale("en");
3517 UErrorCode status = U_ZERO_ERROR;
3518 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3519 const int32_t STRSIZE = 50;
3520 UChar str[STRSIZE];
3521 static const char *strlist[] =
3522 {
3523 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3524 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3525 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3526 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3527 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3528 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3529 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3530 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3531 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3532 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3533 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3534 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3535 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3536 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3537 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3538 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3539 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3540 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3541 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3542 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3543 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3544 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3545 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3546 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3547 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3548 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3549 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3550 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3551 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3552 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3553 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3554 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3555 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3556 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3557 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3558 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3559 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3560 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3561 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3562 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3563 };
3564 int loop;
3565 TEST_ASSERT_SUCCESS(status);
3566 if (U_FAILURE(status)) {
3567 return;
3568 }
3569 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3570 // printf("looping %d\n", loop);
3571 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3572 if (t >= STRSIZE) {
3573 TEST_ASSERT(FALSE);
3574 continue;
3575 }
3576
3577
3578 UnicodeString ustr(str);
3579 RBBILineMonkey monkey;
3580 if (U_FAILURE(monkey.deferredStatus)) {
3581 continue;
3582 }
3583
3584 const int EXPECTEDSIZE = 50;
3585 int expected[EXPECTEDSIZE];
3586 int expectedcount = 0;
3587
3588 monkey.setText(ustr);
3589 int i;
3590 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3591 if (expectedcount >= EXPECTEDSIZE) {
3592 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3593 return;
3594 }
3595 expected[expectedcount ++] = i;
3596 }
3597
3598 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3599 }
3600 delete bi;
3601 #endif
3602 }
3603
3604 void RBBITest::TestSentBreaks(void)
3605 {
3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3607 Locale locale("en");
3608 UErrorCode status = U_ZERO_ERROR;
3609 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3610 UChar str[200];
3611 static const char *strlist[] =
3612 {
3613 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3614 "This\n",
3615 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3616 "\"Sentence ending with a quote.\" Bye.",
3617 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3618 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3619 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3620 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3621 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3622 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3623 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3624 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3625 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3626 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3627 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3628 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3629 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3630 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3631 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3632 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3633 };
3634 int loop;
3635 if (U_FAILURE(status)) {
3636 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3637 return;
3638 }
3639 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3640 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3641 UnicodeString ustr(str);
3642
3643 RBBISentMonkey monkey;
3644 if (U_FAILURE(monkey.deferredStatus)) {
3645 continue;
3646 }
3647
3648 const int EXPECTEDSIZE = 50;
3649 int expected[EXPECTEDSIZE];
3650 int expectedcount = 0;
3651
3652 monkey.setText(ustr);
3653 int i;
3654 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3655 if (expectedcount >= EXPECTEDSIZE) {
3656 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3657 return;
3658 }
3659 expected[expectedcount ++] = i;
3660 }
3661
3662 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3663 }
3664 delete bi;
3665 #endif
3666 }
3667
3668 void RBBITest::TestMonkey() {
3669 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3670
3671 UErrorCode status = U_ZERO_ERROR;
3672 int32_t loopCount = 500;
3673 int32_t seed = 1;
3674 UnicodeString breakType = "all";
3675 Locale locale("en");
3676 UBool useUText = FALSE;
3677
3678 if (quick == FALSE) {
3679 loopCount = 10000;
3680 }
3681
3682 if (fTestParams) {
3683 UnicodeString p(fTestParams);
3684 loopCount = getIntParam("loop", p, loopCount);
3685 seed = getIntParam("seed", p, seed);
3686
3687 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3688 if (m.find()) {
3689 breakType = m.group(1, status);
3690 m.reset();
3691 p = m.replaceFirst("", status);
3692 }
3693
3694 RegexMatcher u(" *utext", p, 0, status);
3695 if (u.find()) {
3696 useUText = TRUE;
3697 u.reset();
3698 p = u.replaceFirst("", status);
3699 }
3700
3701
3702 // m.reset(p);
3703 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3704 // Each option is stripped out of the option string as it is processed.
3705 // All options have been checked. The option string should have been completely emptied..
3706 char buf[100];
3707 p.extract(buf, sizeof(buf), NULL, status);
3708 buf[sizeof(buf)-1] = 0;
3709 errln("Unrecognized or extra parameter: %s\n", buf);
3710 return;
3711 }
3712
3713 }
3714
3715 if (breakType == "char" || breakType == "all") {
3716 RBBICharMonkey m;
3717 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3718 if (U_SUCCESS(status)) {
3719 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3720 if (breakType == "all" && useUText==FALSE) {
3721 // Also run a quick test with UText when "all" is specified
3722 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3723 }
3724 }
3725 else {
3726 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3727 }
3728 delete bi;
3729 }
3730
3731 if (breakType == "word" || breakType == "all") {
3732 logln("Word Break Monkey Test");
3733 RBBIWordMonkey m;
3734 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3735 if (U_SUCCESS(status)) {
3736 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3737 }
3738 else {
3739 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3740 }
3741 delete bi;
3742 }
3743
3744 if (breakType == "line" || breakType == "all") {
3745 logln("Line Break Monkey Test");
3746 RBBILineMonkey m;
3747 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3748 if (loopCount >= 10) {
3749 loopCount = loopCount / 5; // Line break runs slower than the others.
3750 }
3751 if (U_SUCCESS(status)) {
3752 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3753 }
3754 else {
3755 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3756 }
3757 delete bi;
3758 }
3759
3760 if (breakType == "sent" || breakType == "all" ) {
3761 logln("Sentence Break Monkey Test");
3762 RBBISentMonkey m;
3763 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3764 if (loopCount >= 10) {
3765 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3766 }
3767 if (U_SUCCESS(status)) {
3768 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3769 }
3770 else {
3771 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3772 }
3773 delete bi;
3774 }
3775
3776 #endif
3777 }
3778
3779 //
3780 // Run a RBBI monkey test. Common routine, for all break iterator types.
3781 // Parameters:
3782 // bi - the break iterator to use
3783 // mk - MonkeyKind, abstraction for obtaining expected results
3784 // name - Name of test (char, word, etc.) for use in error messages
3785 // seed - Seed for starting random number generator (parameter from user)
3786 // numIterations
3787 //
3788 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3789 int32_t numIterations, UBool useUText) {
3790
3791 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3792
3793 const int32_t TESTSTRINGLEN = 500;
3794 UnicodeString testText;
3795 int32_t numCharClasses;
3796 UVector *chClasses;
3797 int expected[TESTSTRINGLEN*2 + 1];
3798 int expectedCount = 0;
3799 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3800 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3801 char reverseBreaks[TESTSTRINGLEN*2+1];
3802 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3803 char followingBreaks[TESTSTRINGLEN*2+1];
3804 char precedingBreaks[TESTSTRINGLEN*2+1];
3805 int i;
3806 int loopCount = 0;
3807
3808 m_seed = seed;
3809
3810 numCharClasses = mk.charClasses()->size();
3811 chClasses = mk.charClasses();
3812
3813 // Check for errors that occured during the construction of the MonkeyKind object.
3814 // Can't report them where they occured because errln() is a method coming from intlTest,
3815 // and is not visible outside of RBBITest :-(
3816 if (U_FAILURE(mk.deferredStatus)) {
3817 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3818 return;
3819 }
3820
3821 // Verify that the character classes all have at least one member.
3822 for (i=0; i<numCharClasses; i++) {
3823 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3824 if (s == NULL || s->size() == 0) {
3825 errln("Character Class #%d is null or of zero size.", i);
3826 return;
3827 }
3828 }
3829
3830 while (loopCount < numIterations || numIterations == -1) {
3831 if (numIterations == -1 && loopCount % 10 == 0) {
3832 // If test is running in an infinite loop, display a periodic tic so
3833 // we can tell that it is making progress.
3834 fprintf(stderr, ".");
3835 }
3836 // Save current random number seed, so that we can recreate the random numbers
3837 // for this loop iteration in event of an error.
3838 seed = m_seed;
3839
3840 // Populate a test string with data.
3841 testText.truncate(0);
3842 for (i=0; i<TESTSTRINGLEN; i++) {
3843 int32_t aClassNum = m_rand() % numCharClasses;
3844 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3845 int32_t charIdx = m_rand() % classSet->size();
3846 UChar32 c = classSet->charAt(charIdx);
3847 if (c < 0) { // TODO: deal with sets containing strings.
3848 errln("%s:%d c < 0", __FILE__, __LINE__);
3849 break;
3850 }
3851 // Do not assemble a supplementary character from randomly generated separate surrogates.
3852 // (It could be a dictionary character)
3853 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
3854 continue;
3855 }
3856
3857 testText.append(c);
3858 }
3859
3860 // Calculate the expected results for this test string.
3861 mk.setText(testText);
3862 memset(expectedBreaks, 0, sizeof(expectedBreaks));
3863 expectedBreaks[0] = 1;
3864 int32_t breakPos = 0;
3865 expectedCount = 0;
3866 for (;;) {
3867 breakPos = mk.next(breakPos);
3868 if (breakPos == -1) {
3869 break;
3870 }
3871 if (breakPos > testText.length()) {
3872 errln("breakPos > testText.length()");
3873 }
3874 expectedBreaks[breakPos] = 1;
3875 U_ASSERT(expectedCount<testText.length());
3876 expected[expectedCount ++] = breakPos;
3877 (void)expected; // Set but not used warning.
3878 // TODO (andy): check it out.
3879 }
3880
3881 // Find the break positions using forward iteration
3882 memset(forwardBreaks, 0, sizeof(forwardBreaks));
3883 if (useUText) {
3884 UErrorCode status = U_ZERO_ERROR;
3885 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3886 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3887 bi->setText(testUText, status);
3888 TEST_ASSERT_SUCCESS(status);
3889 utext_close(testUText); // The break iterator does a shallow clone of the UText
3890 // This UText can be closed immediately, so long as the
3891 // testText string continues to exist.
3892 } else {
3893 bi->setText(testText);
3894 }
3895
3896 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3897 if (i < 0 || i > testText.length()) {
3898 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3899 break;
3900 }
3901 forwardBreaks[i] = 1;
3902 }
3903
3904 // Find the break positions using reverse iteration
3905 memset(reverseBreaks, 0, sizeof(reverseBreaks));
3906 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3907 if (i < 0 || i > testText.length()) {
3908 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3909 break;
3910 }
3911 reverseBreaks[i] = 1;
3912 }
3913
3914 // Find the break positions using isBoundary() tests.
3915 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3916 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3917 for (i=0; i<=testText.length(); i++) {
3918 isBoundaryBreaks[i] = bi->isBoundary(i);
3919 }
3920
3921
3922 // Find the break positions using the following() function.
3923 // printf(".");
3924 memset(followingBreaks, 0, sizeof(followingBreaks));
3925 int32_t lastBreakPos = 0;
3926 followingBreaks[0] = 1;
3927 for (i=0; i<testText.length(); i++) {
3928 breakPos = bi->following(i);
3929 if (breakPos <= i ||
3930 breakPos < lastBreakPos ||
3931 breakPos > testText.length() ||
3932 (breakPos > lastBreakPos && lastBreakPos > i)) {
3933 errln("%s break monkey test: "
3934 "Out of range value returned by BreakIterator::following().\n"
3935 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
3936 name, seed, i, breakPos, lastBreakPos);
3937 break;
3938 }
3939 followingBreaks[breakPos] = 1;
3940 lastBreakPos = breakPos;
3941 }
3942
3943 // Find the break positions using the preceding() function.
3944 memset(precedingBreaks, 0, sizeof(precedingBreaks));
3945 lastBreakPos = testText.length();
3946 precedingBreaks[testText.length()] = 1;
3947 for (i=testText.length(); i>0; i--) {
3948 breakPos = bi->preceding(i);
3949 if (breakPos >= i ||
3950 breakPos > lastBreakPos ||
3951 (breakPos < 0 && testText.getChar32Start(i)>0) ||
3952 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
3953 errln("%s break monkey test: "
3954 "Out of range value returned by BreakIterator::preceding().\n"
3955 "index=%d; prev returned %d; lastBreak=%d" ,
3956 name, i, breakPos, lastBreakPos);
3957 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
3958 precedingBreaks[i] = 2; // Forces an error.
3959 }
3960 } else {
3961 if (breakPos >= 0) {
3962 precedingBreaks[breakPos] = 1;
3963 }
3964 lastBreakPos = breakPos;
3965 }
3966 }
3967
3968 // Compare the expected and actual results.
3969 for (i=0; i<=testText.length(); i++) {
3970 const char *errorType = NULL;
3971 if (forwardBreaks[i] != expectedBreaks[i]) {
3972 errorType = "next()";
3973 } else if (reverseBreaks[i] != forwardBreaks[i]) {
3974 errorType = "previous()";
3975 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
3976 errorType = "isBoundary()";
3977 } else if (followingBreaks[i] != expectedBreaks[i]) {
3978 errorType = "following()";
3979 } else if (precedingBreaks[i] != expectedBreaks[i]) {
3980 errorType = "preceding()";
3981 }
3982
3983
3984 if (errorType != NULL) {
3985 // Format a range of the test text that includes the failure as
3986 // a data item that can be included in the rbbi test data file.
3987
3988 // Start of the range is the last point where expected and actual results
3989 // both agreed that there was a break position.
3990 int startContext = i;
3991 int32_t count = 0;
3992 for (;;) {
3993 if (startContext==0) { break; }
3994 startContext --;
3995 if (expectedBreaks[startContext] != 0) {
3996 if (count == 2) break;
3997 count ++;
3998 }
3999 }
4000
4001 // End of range is two expected breaks past the start position.
4002 int endContext = i + 1;
4003 int ci;
4004 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4005 for (;;) {
4006 if (endContext >= testText.length()) {break;}
4007 if (expectedBreaks[endContext-1] != 0) {
4008 if (count == 0) break;
4009 count --;
4010 }
4011 endContext ++;
4012 }
4013 }
4014
4015 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4016 UnicodeString errorText = "<data>";
4017 /***if (strcmp(errorType, "next()") == 0) {
4018 startContext = 0;
4019 endContext = testText.length();
4020
4021 printStringBreaks(testText, expected, expectedCount);
4022 }***/
4023
4024 for (ci=startContext; ci<endContext;) {
4025 UnicodeString hexChars("0123456789abcdef");
4026 UChar32 c;
4027 int bn;
4028 c = testText.char32At(ci);
4029 if (ci == i) {
4030 // This is the location of the error.
4031 errorText.append("<?>");
4032 } else if (expectedBreaks[ci] != 0) {
4033 // This a non-error expected break position.
4034 errorText.append("\\");
4035 }
4036 if (c < 0x10000) {
4037 errorText.append("\\u");
4038 for (bn=12; bn>=0; bn-=4) {
4039 errorText.append(hexChars.charAt((c>>bn)&0xf));
4040 }
4041 } else {
4042 errorText.append("\\U");
4043 for (bn=28; bn>=0; bn-=4) {
4044 errorText.append(hexChars.charAt((c>>bn)&0xf));
4045 }
4046 }
4047 ci = testText.moveIndex32(ci, 1);
4048 }
4049 errorText.append("\\");
4050 errorText.append("</data>\n");
4051
4052 // Output the error
4053 char charErrorTxt[500];
4054 UErrorCode status = U_ZERO_ERROR;
4055 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4056 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4057 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4058
4059 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4060 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4061 errorType, seed, i, charErrorTxt);
4062 break;
4063 }
4064 }
4065
4066 loopCount++;
4067 }
4068 #endif
4069 }
4070
4071
4072 // Bug 5532. UTF-8 based UText fails in dictionary code.
4073 // This test checks the initial patch,
4074 // which is to just keep it from crashing. Correct word boundaries
4075 // await a proper fix to the dictionary code.
4076 //
4077 void RBBITest::TestBug5532(void) {
4078 // Text includes a mixture of Thai and Latin.
4079 const unsigned char utf8Data[] = {
4080 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4081 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4082 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4083 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4084 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4085 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4086 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4087 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4088 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4089 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4090 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4091
4092 UErrorCode status = U_ZERO_ERROR;
4093 UText utext=UTEXT_INITIALIZER;
4094 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4095 TEST_ASSERT_SUCCESS(status);
4096
4097 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4098 TEST_ASSERT_SUCCESS(status);
4099 if (U_SUCCESS(status)) {
4100 bi->setText(&utext, status);
4101 TEST_ASSERT_SUCCESS(status);
4102
4103 int32_t breakCount = 0;
4104 int32_t previousBreak = -1;
4105 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4106 // For now, just make sure that the break iterator doesn't hang.
4107 TEST_ASSERT(previousBreak < bi->current());
4108 previousBreak = bi->current();
4109 }
4110 TEST_ASSERT(breakCount > 0);
4111 }
4112 delete bi;
4113 utext_close(&utext);
4114 }
4115
4116
4117 void RBBITest::TestBug9983(void) {
4118 UnicodeString text = UnicodeString("\\u002A" // * Other
4119 "\\uFF65" // Other
4120 "\\u309C" // Katakana
4121 "\\uFF9F" // Extend
4122 "\\uFF65" // Other
4123 "\\u0020" // Other
4124 "\\u0000").unescape();
4125
4126 UErrorCode status = U_ZERO_ERROR;
4127 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4128 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4129 TEST_ASSERT_SUCCESS(status);
4130 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4131 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4132 TEST_ASSERT_SUCCESS(status);
4133 if (U_FAILURE(status)) {
4134 return;
4135 }
4136 int32_t offset, rstatus, iterationCount;
4137
4138 brkiter->setText(text);
4139 brkiter->last();
4140 iterationCount = 0;
4141 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4142 iterationCount++;
4143 rstatus = brkiter->getRuleStatus();
4144 (void)rstatus; // Suppress set but not used warning.
4145 if (iterationCount >= 10) {
4146 break;
4147 }
4148 }
4149 TEST_ASSERT(iterationCount == 6);
4150
4151 brkiterPOSIX->setText(text);
4152 brkiterPOSIX->last();
4153 iterationCount = 0;
4154 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4155 iterationCount++;
4156 rstatus = brkiterPOSIX->getRuleStatus();
4157 (void)rstatus; // Suppress set but not used warning.
4158 if (iterationCount >= 10) {
4159 break;
4160 }
4161 }
4162 TEST_ASSERT(iterationCount == 6);
4163 }
4164
4165 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4166 //
4167 void RBBITest::TestBug7547() {
4168 UnicodeString rules;
4169 UErrorCode status = U_ZERO_ERROR;
4170 UParseError parseError;
4171 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4172 if (status != U_BRK_RULE_SYNTAX) {
4173 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4174 }
4175 if (parseError.line != 1 || parseError.offset != 0) {
4176 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4177 }
4178 }
4179
4180
4181 void RBBITest::TestBug12797() {
4182 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4183 UErrorCode status = U_ZERO_ERROR;
4184 UParseError parseError;
4185 RuleBasedBreakIterator bi(rules, parseError, status);
4186 if (U_FAILURE(status)) {
4187 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4188 return;
4189 }
4190 UnicodeString text = "abc";
4191 bi.setText(text);
4192 bi.first();
4193 int32_t boundary = bi.next();
4194 if (boundary != 3) {
4195 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4196 }
4197 }
4198
4199 void RBBITest::TestBug12918() {
4200 // This test triggers an assertion failure in dictbe.cpp
4201 const UChar *crasherString = u"\u3325\u4a16";
4202 UErrorCode status = U_ZERO_ERROR;
4203 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4204 if (U_FAILURE(status)) {
4205 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4206 return;
4207 }
4208 ubrk_first(iter);
4209 int32_t pos = 0;
4210 int32_t lastPos = -1;
4211 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4212 if (pos <= lastPos) {
4213 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4214 break;
4215 }
4216 }
4217 ubrk_close(iter);
4218 }
4219
4220 void RBBITest::TestBug12932() {
4221 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4222 UnicodeString ruleStr(
4223 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4224 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4225 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4226 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4227 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4228 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4229
4230 UErrorCode status = U_ZERO_ERROR;
4231 UParseError parseError;
4232 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4233 if (status != U_BRK_RULE_SYNTAX) {
4234 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4235 __FILE__, __LINE__, u_errorName(status));
4236 }
4237 }
4238
4239
4240 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4241 // remain undevided by ICU char, word and line break.
4242 void RBBITest::TestEmoji() {
4243 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4244 UErrorCode status = U_ZERO_ERROR;
4245
4246 CharString testFileName;
4247 testFileName.append(IntlTest::getSourceTestData(status), status);
4248 testFileName.appendPathPart("emoji-test.txt", status);
4249 if (U_FAILURE(status)) {
4250 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4251 return;
4252 }
4253 logln("Opening data file %s\n", testFileName.data());
4254
4255 int len;
4256 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4257 if (U_FAILURE(status) || testFile == NULL) {
4258 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4259 return;
4260 }
4261 UnicodeString testFileAsString(testFile, len);
4262 delete [] testFile;
4263
4264 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4265 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4266 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4267 int32_t lineNumber = 0;
4268
4269 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4270 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4271 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4272 if (U_FAILURE(status)) {
4273 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4274 return;
4275 }
4276
4277 while (lineMatcher.find()) {
4278 ++lineNumber;
4279 UnicodeString line = lineMatcher.group(status);
4280 hexMatcher.reset(line);
4281 UnicodeString testString; // accumulates the emoji sequence.
4282 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4283 UnicodeString hex = hexMatcher.group(1, status);
4284 if (hex.length() > 8) {
4285 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4286 break;
4287 }
4288 CharString hex8;
4289 hex8.appendInvariantChars(hex, status);
4290 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4291 if (c<=0x10ffff) {
4292 testString.append(c);
4293 } else {
4294 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4295 __FILE__, __LINE__, lineNumber, hex8.data());
4296 break;
4297 }
4298 }
4299
4300 if (testString.length() > 1) {
4301 charBreaks->setText(testString);
4302 charBreaks->first();
4303 int32_t firstBreak = charBreaks->next();
4304 if (testString.length() != firstBreak) {
4305 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4306 __FILE__, __LINE__, lineNumber, firstBreak);
4307 }
4308 wordBreaks->setText(testString);
4309 wordBreaks->first();
4310 firstBreak = wordBreaks->next();
4311 if (testString.length() != firstBreak) {
4312 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4313 __FILE__, __LINE__, lineNumber, firstBreak);
4314 }
4315 lineBreaks->setText(testString);
4316 lineBreaks->first();
4317 firstBreak = lineBreaks->next();
4318 if (testString.length() != firstBreak) {
4319 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4320 __FILE__, __LINE__, lineNumber, firstBreak);
4321 }
4322 }
4323 }
4324 #endif
4325 }
4326
4327
4328 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4329
// WHERE Macro yields a literal string of the form "source_file_name:line number "
// It is built by compile-time concatenation of adjacent string literals, so it can
// also be followed directly by another literal:  WHERE "extra message".
// TODO: propose something equivalent as a test framework addition.

// XLINE exists only to get __LINE__ expanded to its number before LINE applies
// the '#' stringizing operator; applying '#' directly would produce "__LINE__".
#define WHERE __FILE__ ":" XLINE(__LINE__) " "
#define XLINE(s) LINE(s)
#define LINE(s) #s
4336
4337 void RBBITest::TestBug12519() {
4338 UErrorCode status = U_ZERO_ERROR;
4339 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4340 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4341 if (!assertSuccess(WHERE, status)) {
4342 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4343 return;
4344 }
4345 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4346
4347 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4348 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4349
4350 LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
4351 assertTrue(WHERE, *biEn == *cloneEn);
4352 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4353
4354 LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
4355 assertTrue(WHERE, *biFr == *cloneFr);
4356 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4357
4358 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4359 UnicodeString text("Hallo Welt");
4360 biDe->setText(text);
4361 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4362 *biDe = *biFr;
4363 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4364 }
4365
4366 void RBBITest::TestBug12677() {
4367 // Check that stripping of comments from rules for getRules() is not confused by
4368 // the presence of '#' characters in the rules that do not introduce comments.
4369 UnicodeString rules(u"!!forward; \n"
4370 "$x = [ab#]; # a set with a # literal. \n"
4371 " # .; # a comment that looks sort of like a rule. \n"
4372 " '#' '?'; # a rule with a quoted # \n"
4373 );
4374
4375 UErrorCode status = U_ZERO_ERROR;
4376 UParseError pe;
4377 RuleBasedBreakIterator bi(rules, pe, status);
4378 assertSuccess(WHERE, status);
4379 UnicodeString rtRules = bi.getRules();
4380 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
4381 }
4382
4383
4384 void RBBITest::TestTableRedundancies() {
4385 UErrorCode status = U_ZERO_ERROR;
4386
4387 LocalPointer<RuleBasedBreakIterator> bi (
4388 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4389 assertSuccess(WHERE, status);
4390 if (U_FAILURE(status)) return;
4391
4392 RBBIDataWrapper *dw = bi->fData;
4393 const RBBIStateTable *fwtbl = dw->fForwardTable;
4394 int32_t numCharClasses = dw->fHeader->fCatCount;
4395 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4396
4397 // Check for duplicate columns (character categories)
4398
4399 std::vector<UnicodeString> columns;
4400 for (int32_t column = 0; column < numCharClasses; column++) {
4401 UnicodeString s;
4402 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4403 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4404 s.append(row->fNextState[column]);
4405 }
4406 columns.push_back(s);
4407 }
4408 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4409 for (int c1=1; c1<numCharClasses; c1++) {
4410 for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4411 if (columns.at(c1) == columns.at(c2)) {
4412 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4413 goto out;
4414 }
4415 }
4416 }
4417 out:
4418
4419 // Check for duplicate states
4420 std::vector<UnicodeString> rows;
4421 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4422 UnicodeString s;
4423 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4424 assertTrue(WHERE, row->fAccepting >= -1);
4425 s.append(row->fAccepting + 1); // values of -1 are expected.
4426 s.append(row->fLookAhead);
4427 s.append(row->fTagIdx);
4428 for (int32_t column = 0; column < numCharClasses; column++) {
4429 s.append(row->fNextState[column]);
4430 }
4431 rows.push_back(s);
4432 }
4433 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4434 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4435 if (rows.at(r1) == rows.at(r2)) {
4436 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4437 return;
4438 }
4439 }
4440 }
4441 }
4442
4443 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4444 // even after next() has returned DONE.
4445
4446 void RBBITest::TestBug13447() {
4447 UErrorCode status = U_ZERO_ERROR;
4448 LocalPointer<RuleBasedBreakIterator> bi(
4449 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4450 assertSuccess(WHERE, status);
4451 if (U_FAILURE(status)) return;
4452 UnicodeString data(u"1234");
4453 bi->setText(data);
4454 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4455 assertEquals(WHERE, 4, bi->next());
4456 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4457 assertEquals(WHERE, UBRK_DONE, bi->next());
4458 assertEquals(WHERE, 4, bi->current());
4459 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4460 }
4461
4462 // TestReverse exercises both the synthesized safe reverse rules and the logic
4463 // for filling the break iterator cache when starting from random positions
4464 // in the text.
4465 //
4466 // It's a monkey test, working on random data, with the expected data obtained
4467 // from forward iteration (no safe rules involved), comparing with results
4468 // when indexing into the interior of the string (safe rules needed).
4469
4470 void RBBITest::TestReverse() {
4471 UErrorCode status = U_ZERO_ERROR;
4472
4473 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4474 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4475 assertSuccess(WHERE, status, true);
4476 status = U_ZERO_ERROR;
4477 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4478 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4479 assertSuccess(WHERE, status, true);
4480 status = U_ZERO_ERROR;
4481 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4482 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4483 assertSuccess(WHERE, status, true);
4484 status = U_ZERO_ERROR;
4485 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4486 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4487 assertSuccess(WHERE, status, true);
4488 }
4489
4490 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4491 if (!bi) {
4492 return;
4493 }
4494
4495 // From the mapping trie in the break iterator's internal data, create a
4496 // vector of UnicodeStrings, one for each character category, containing
4497 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4498 // to avoid an execess of unassigned code points.
4499
4500 RBBIDataWrapper *data = bi->fData;
4501 int32_t categoryCount = data->fHeader->fCatCount;
4502 UTrie2 *trie = data->fTrie;
4503
4504 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4505 for (int cp=0; cp<0x1fff0; ++cp) {
4506 int cat = utrie2_get32(trie, cp);
4507 cat &= ~0x4000; // And off the dictionary bit from the category.
4508 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4509 if (cat < 0 || cat >= categoryCount) return;
4510 strings[cat].append(cp);
4511 }
4512
4513 icu_rand randomGen;
4514 const int testStringLength = 10000;
4515 UnicodeString testString;
4516
4517 for (int i=0; i<testStringLength; ++i) {
4518 int charClass = randomGen() % categoryCount;
4519 if (strings[charClass].length() > 0) {
4520 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4521 testString.append(cp);
4522 }
4523 }
4524
4525 typedef std::pair<UBool, int32_t> Result;
4526 std::vector<Result> expectedResults;
4527 bi->setText(testString);
4528 for (int i=0; i<testString.length(); ++i) {
4529 bool isboundary = bi->isBoundary(i);
4530 int ruleStatus = bi->getRuleStatus();
4531 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4532 }
4533
4534 for (int i=testString.length()-1; i>=0; --i) {
4535 bi->setText(testString); // clears the internal break cache
4536 Result expected = expectedResults[i];
4537 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4538 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4539 }
4540 }
4541
4542
4543 // Ticket 13692 - finding word boundaries in very large numbers or words could
4544 // be very time consuming. When the problem was present, this void test
4545 // would run more than fifteen minutes, which is to say, the failure was noticeale.
4546
4547 void RBBITest::TestBug13692() {
4548 UErrorCode status = U_ZERO_ERROR;
4549 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4550 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4551 if (!assertSuccess(WHERE, status, true)) {
4552 return;
4553 }
4554 constexpr int32_t LENGTH = 1000000;
4555 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4556 for (int i=0; i<20; i+=2) {
4557 longNumber.setCharAt(i, u' ');
4558 }
4559 bi->setText(longNumber);
4560 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4561 assertSuccess(WHERE, status);
4562 }
4563
4564 //
4565 // TestDebug - A place-holder test for debugging purposes.
4566 // For putting in fragments of other tests that can be invoked
4567 // for tracing without a lot of unwanted extra stuff happening.
4568 //
4569 void RBBITest::TestDebug(void) {
4570 UErrorCode status = U_ZERO_ERROR;
4571 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4572 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4573 if (!assertSuccess(WHERE, status, true)) {
4574 return;
4575 }
4576 const UnicodeString &rules = bi->getRules();
4577 UParseError pe;
4578 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4579 assertSuccess(WHERE, status);
4580 }
4581
4582 void RBBITest::TestProperties() {
4583 UErrorCode errorCode = U_ZERO_ERROR;
4584 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4585 if (!prependSet.isEmpty()) {
4586 errln(
4587 "[:GCB=Prepend:] is not empty any more. "
4588 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4589 "change this test to the opposite condition.");
4590 }
4591 }
4592
4593 #endif // #if !UCONFIG_NO_BREAK_ITERATION