]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <utility>
21 #include <vector>
22
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
29 #endif
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
38
39 #include "charstr.h"
40 #include "cmemory.h"
41 #include "cstr.h"
42 #include "intltest.h"
43 #include "rbbitst.h"
44 #include "rbbidata.h"
45 #include "utypeinfo.h" // for 'typeid' to work
46 #include "uvector.h"
47 #include "uvectr32.h"
48
49 // Needed for Apple perf tests <rdar://problem/51193810>
50 #include <unistd.h>
51 #include <mach/mach_time.h>
52
53
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57
// TEST_ASSERT(x): report a test failure, tagged with the current source
// location, when the condition x evaluates to false.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// TEST_ASSERT_SUCCESS(errcode): report a failure, tagged with the current
// source location and the ICU error name, when errcode is a failure code.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// MONKEY_ERROR(msg, fRuleFileName, index, seed): report an error through the
// global test object, including the parameters printed as needed to reproduce it.
// NOTE(review): unlike the two macros above this one expands to a bare brace
// block; it is left that way because call sites are not visible here.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                           __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
74
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78
79
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitst.txt. In most cases it can;
//       that is much less work than writing a new test, the diagnostic output in the
//       event of failures is good, and the test data file is shared with ICU4J, so
//       eventually the test will run there as well, without additional effort.
85
86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87 {
88 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89 fTestParams = params;
90
91 TESTCASE_AUTO_BEGIN;
92 #if !UCONFIG_NO_FILE_IO
93 TESTCASE_AUTO(TestBug4153072);
94 #endif
95 #if !UCONFIG_NO_FILE_IO
96 TESTCASE_AUTO(TestUnicodeFiles);
97 #endif
98 TESTCASE_AUTO(TestGetAvailableLocales);
99 TESTCASE_AUTO(TestGetDisplayName);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestEndBehaviour);
102 TESTCASE_AUTO(TestWordBreaks);
103 TESTCASE_AUTO(TestWordBoundary);
104 TESTCASE_AUTO(TestLineBreaks);
105 TESTCASE_AUTO(TestSentBreaks);
106 TESTCASE_AUTO(TestExtended);
107 #endif
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109 TESTCASE_AUTO(TestMonkey);
110 #endif
111 #if !UCONFIG_NO_FILE_IO
112 TESTCASE_AUTO(TestBug3818);
113 #endif
114 TESTCASE_AUTO(TestDebug);
115 #if !UCONFIG_NO_FILE_IO
116 TESTCASE_AUTO(TestBug5775);
117 #endif
118 TESTCASE_AUTO(TestBug9983);
119 TESTCASE_AUTO(TestDictRules);
120 TESTCASE_AUTO(TestBug5532);
121 TESTCASE_AUTO(TestBug7547);
122 TESTCASE_AUTO(TestBug12797);
123 TESTCASE_AUTO(TestBug12918);
124 TESTCASE_AUTO(TestBug12932);
125 TESTCASE_AUTO(TestEmoji);
126 TESTCASE_AUTO(TestBug12519);
127 TESTCASE_AUTO(TestBug12677);
128 TESTCASE_AUTO(TestTableRedundancies);
129 TESTCASE_AUTO(TestBug13447);
130 TESTCASE_AUTO(TestReverse);
131 TESTCASE_AUTO(TestBug13692);
132 TESTCASE_AUTO_END;
133 }
134
135
136 //--------------------------------------------------------------------------------------
137 //
138 // RBBITest constructor and destructor
139 //
140 //--------------------------------------------------------------------------------------
141
142 RBBITest::RBBITest() {
143 fTestParams = NULL;
144 }
145
146
147 RBBITest::~RBBITest() {
148 }
149
150
151 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
152 UErrorCode status = U_ZERO_ERROR;
153 char name[100];
154 printf("code alpha extend alphanum type word sent line name\n");
155 int nextExpectedIndex = 0;
156 utext_setNativeIndex(tstr, 0);
157 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
158 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
159 printf("------------------------------------------------ %d\n", j);
160 ++nextExpectedIndex;
161 }
162
163 UChar32 c = utext_next32(tstr);
164 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
165 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
166 u_isUAlphabetic(c),
167 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
168 u_isalnum(c),
169 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
170 u_charType(c),
171 U_SHORT_PROPERTY_NAME),
172 u_getPropertyValueName(UCHAR_WORD_BREAK,
173 u_getIntPropertyValue(c,
174 UCHAR_WORD_BREAK),
175 U_SHORT_PROPERTY_NAME),
176 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
177 u_getIntPropertyValue(c,
178 UCHAR_SENTENCE_BREAK),
179 U_SHORT_PROPERTY_NAME),
180 u_getPropertyValueName(UCHAR_LINE_BREAK,
181 u_getIntPropertyValue(c,
182 UCHAR_LINE_BREAK),
183 U_SHORT_PROPERTY_NAME),
184 name);
185 }
186 }
187
188
189 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
190 UErrorCode status = U_ZERO_ERROR;
191 UText *tstr = NULL;
192 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
193 if (U_FAILURE(status)) {
194 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
195 return;
196 }
197 printStringBreaks(tstr, expected, expectedCount);
198 utext_close(tstr);
199 }
200
201
202 void RBBITest::TestBug3818() {
203 UErrorCode status = U_ZERO_ERROR;
204
205 // Four Thai words...
206 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
207 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
208 UnicodeString thaiStr(thaiWordData);
209
210 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
211 if (U_FAILURE(status) || bi == NULL) {
212 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
213 return;
214 }
215 bi->setText(thaiStr);
216
217 int32_t startOfSecondWord = bi->following(1);
218 if (startOfSecondWord != 4) {
219 errln("Fail at file %s, line %d expected start of word at 4, got %d",
220 __FILE__, __LINE__, startOfSecondWord);
221 }
222 startOfSecondWord = bi->following(0);
223 if (startOfSecondWord != 4) {
224 errln("Fail at file %s, line %d expected start of word at 4, got %d",
225 __FILE__, __LINE__, startOfSecondWord);
226 }
227 delete bi;
228 }
229
230
231 //---------------------------------------------
232 //
233 // other tests
234 //
235 //---------------------------------------------
236
237 void RBBITest::TestGetAvailableLocales()
238 {
239 int32_t locCount = 0;
240 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
241
242 if (locCount == 0)
243 dataerrln("getAvailableLocales() returned an empty list!");
244 // Just make sure that it's returning good memory.
245 int32_t i;
246 for (i = 0; i < locCount; ++i) {
247 logln(locList[i].getName());
248 }
249 }
250
251 //Testing the BreakIterator::getDisplayName() function
252 void RBBITest::TestGetDisplayName()
253 {
254 UnicodeString result;
255
256 BreakIterator::getDisplayName(Locale::getUS(), result);
257 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
258 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
259 + result);
260
261 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
262 if (result != "French (France)")
263 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
264 + result);
265 }
266 /**
267 * Test End Behaviour
268 * @bug 4068137
269 */
270 void RBBITest::TestEndBehaviour()
271 {
272 UErrorCode status = U_ZERO_ERROR;
273 UnicodeString testString("boo.");
274 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
275 if (U_FAILURE(status))
276 {
277 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
278 return;
279 }
280 wb->setText(testString);
281
282 if (wb->first() != 0)
283 errln("Didn't get break at beginning of string.");
284 if (wb->next() != 3)
285 errln("Didn't get break before period in \"boo.\"");
286 if (wb->current() != 4 && wb->next() != 4)
287 errln("Didn't get break at end of string.");
288 delete wb;
289 }
290 /*
291 * @bug 4153072
292 */
293 void RBBITest::TestBug4153072() {
294 UErrorCode status = U_ZERO_ERROR;
295 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
296 if (U_FAILURE(status))
297 {
298 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
299 return;
300 }
301 UnicodeString str("...Hello, World!...");
302 int32_t begin = 3;
303 int32_t end = str.length() - 3;
304 UBool onBoundary;
305
306 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
307 iter->adoptText(textIterator);
308 int index;
309 // Note: with the switch to UText, there is no way to restrict the
310 // iteration range to begin at an index other than zero.
311 // String character iterators created with a non-zero bound are
312 // treated by RBBI as being empty.
313 for (index = -1; index < begin + 1; ++index) {
314 onBoundary = iter->isBoundary(index);
315 if (index == 0? !onBoundary : onBoundary) {
316 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
317 " and begin index = " + begin);
318 }
319 }
320 delete iter;
321 }
322
323
324 //
325 // Test for problem reported by Ashok Matoria on 9 July 2007
326 // One.<kSoftHyphen><kSpace>Two.
327 //
328 // Sentence break at start (0) and then on calling next() it breaks at
329 // 'T' of "Two". Now, at this point if I do next() and
330 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
331 //
332 void RBBITest::TestBug5775() {
333 UErrorCode status = U_ZERO_ERROR;
334 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
335 TEST_ASSERT_SUCCESS(status);
336 if (U_FAILURE(status)) {
337 return;
338 }
339 // Check for status first for better handling of no data errors.
340 TEST_ASSERT(bi != NULL);
341 if (bi == NULL) {
342 return;
343 }
344
345 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
346 // 01234 56789
347 s = s.unescape();
348 bi->setText(s);
349 int pos = bi->next();
350 TEST_ASSERT(pos == 6);
351 pos = bi->next();
352 TEST_ASSERT(pos == 10);
353 pos = bi->previous();
354 TEST_ASSERT(pos == 6);
355 delete bi;
356 }
357
358
359
360 //------------------------------------------------------------------------------
361 //
362 // RBBITest::Extended Run RBBI Tests from an external test data file
363 //
364 //------------------------------------------------------------------------------
365
366 struct TestParams {
367 BreakIterator *bi; // Break iterator is set while parsing test source.
368 // Changed out whenever test data changes break type.
369
370 UnicodeString dataToBreak; // Data that is built up while parsing the test.
371 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
372 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
373 UVector32 *srcCol;
374
375 UText *textToBreak; // UText, could be UTF8 or UTF16.
376 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
377 CharString utf8String; // UTF-8 form of text to break.
378
379 TestParams(UErrorCode &status) : dataToBreak() {
380 bi = NULL;
381 expectedBreaks = new UVector32(status);
382 srcLine = new UVector32(status);
383 srcCol = new UVector32(status);
384 textToBreak = NULL;
385 textMap = new UVector32(status);
386 }
387
388 ~TestParams() {
389 delete bi;
390 delete expectedBreaks;
391 delete srcLine;
392 delete srcCol;
393 utext_close(textToBreak);
394 delete textMap;
395 }
396
397 int32_t getSrcLine(int32_t bp);
398 int32_t getExpectedBreak(int32_t bp);
399 int32_t getSrcCol(int32_t bp);
400
401 void setUTF16(UErrorCode &status);
402 void setUTF8(UErrorCode &status);
403 };
404
405 // Append a UnicodeString to a CharString with UTF-8 encoding.
406 // Substitute any invalid chars.
407 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
408 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
409 if (U_FAILURE(status)) {
410 return;
411 }
412 int32_t utf8Length;
413 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
414 src.getBuffer(), src.length(), // UTF-16 data
415 0xfffd, NULL, // Substitution char, number of subs.
416 &status);
417 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
418 return;
419 }
420 status = U_ZERO_ERROR;
421 int32_t capacity;
422 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
423 u_strToUTF8WithSub(buffer, utf8Length, NULL,
424 src.getBuffer(), src.length(),
425 0xfffd, NULL, &status);
426 dest.append(buffer, utf8Length, status);
427 }
428
429
430 void TestParams::setUTF16(UErrorCode &status) {
431 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
432 textMap->removeAllElements();
433 for (int32_t i=0; i<dataToBreak.length(); i++) {
434 if (i == dataToBreak.getChar32Start(i)) {
435 textMap->addElement(i, status);
436 } else {
437 textMap->addElement(-1, status);
438 }
439 }
440 textMap->addElement(dataToBreak.length(), status);
441 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
442 }
443
444
445 void TestParams::setUTF8(UErrorCode &status) {
446 if (U_FAILURE(status)) {
447 return;
448 }
449 utf8String.clear();
450 CharStringAppend(utf8String, dataToBreak, status);
451 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
452 if (U_FAILURE(status)) {
453 return;
454 }
455
456 textMap->removeAllElements();
457 int32_t utf16Index = 0;
458 for (;;) {
459 textMap->addElement(utf16Index, status);
460 UChar32 c32 = utext_current32(textToBreak);
461 if (c32 < 0) {
462 break;
463 }
464 utf16Index += U16_LENGTH(c32);
465 utext_next32(textToBreak);
466 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
467 textMap->addElement(-1, status);
468 }
469 }
470 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
471 }
472
473
474 int32_t TestParams::getSrcLine(int32_t bp) {
475 if (bp >= textMap->size()) {
476 bp = textMap->size() - 1;
477 }
478 int32_t i = 0;
479 for(; bp >= 0 ; --bp) {
480 // Move to a character boundary if we are not on one already.
481 i = textMap->elementAti(bp);
482 if (i >= 0) {
483 break;
484 }
485 }
486 return srcLine->elementAti(i);
487 }
488
489
490 int32_t TestParams::getExpectedBreak(int32_t bp) {
491 if (bp >= textMap->size()) {
492 return 0;
493 }
494 int32_t i = textMap->elementAti(bp);
495 int32_t retVal = 0;
496 if (i >= 0) {
497 retVal = expectedBreaks->elementAti(i);
498 }
499 return retVal;
500 }
501
502
503 int32_t TestParams::getSrcCol(int32_t bp) {
504 if (bp >= textMap->size()) {
505 bp = textMap->size() - 1;
506 }
507 int32_t i = 0;
508 for(; bp >= 0; --bp) {
509 // Move bp to a character boundary if we are not on one already.
510 i = textMap->elementAti(bp);
511 if (i >= 0) {
512 break;
513 }
514 }
515 return srcCol->elementAti(i);
516 }
517
518
519 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
520 int32_t bp;
521 int32_t prevBP;
522 int32_t i;
523
524 TEST_ASSERT_SUCCESS(status);
525 if (U_FAILURE(status)) {
526 return;
527 }
528
529 if (t->bi == NULL) {
530 return;
531 }
532
533 t->bi->setText(t->textToBreak, status);
534 //
535 // Run the iterator forward
536 //
537 prevBP = -1;
538 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
539 if (prevBP == bp) {
540 // Fail for lack of forward progress.
541 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
542 bp, t->getSrcLine(bp), t->getSrcCol(bp));
543 break;
544 }
545
546 // Check that there we didn't miss an expected break between the last one
547 // and this one.
548 for (i=prevBP+1; i<bp; i++) {
549 if (t->getExpectedBreak(i) != 0) {
550 int expected[] = {0, i};
551 printStringBreaks(t->dataToBreak, expected, 2);
552 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
553 i, t->getSrcLine(i), t->getSrcCol(i));
554 }
555 }
556
557 // Check that the break we did find was expected
558 if (t->getExpectedBreak(bp) == 0) {
559 int expected[] = {0, bp};
560 printStringBreaks(t->textToBreak, expected, 2);
561 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
562 bp, t->getSrcLine(bp), t->getSrcCol(bp));
563 } else {
564 // The break was expected.
565 // Check that the {nnn} tag value is correct.
566 int32_t expectedTagVal = t->getExpectedBreak(bp);
567 if (expectedTagVal == -1) {
568 expectedTagVal = 0;
569 }
570 int32_t line = t->getSrcLine(bp);
571 int32_t rs = t->bi->getRuleStatus();
572 if (rs != expectedTagVal) {
573 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
574 " Actual, Expected status = %4d, %4d",
575 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
576 }
577 }
578
579 prevBP = bp;
580 }
581
582 // Verify that there were no missed expected breaks after the last one found
583 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
584 if (t->getExpectedBreak(i) != 0) {
585 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
586 i, t->getSrcLine(i), t->getSrcCol(i));
587 }
588 }
589
590 //
591 // Run the iterator backwards, verify that the same breaks are found.
592 //
593 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
594 bp = t->bi->last();
595 while (bp != BreakIterator::DONE) {
596 if (prevBP == bp) {
597 // Fail for lack of progress.
598 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
599 bp, t->getSrcLine(bp), t->getSrcCol(bp));
600 break;
601 }
602
603 // Check that we didn't miss an expected break between the last one
604 // and this one. (UVector returns zeros for index out of bounds.)
605 for (i=prevBP-1; i>bp; i--) {
606 if (t->getExpectedBreak(i) != 0) {
607 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
608 i, t->getSrcLine(i), t->getSrcCol(i));
609 }
610 }
611
612 // Check that the break we did find was expected
613 if (t->getExpectedBreak(bp) == 0) {
614 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
615 bp, t->getSrcLine(bp), t->getSrcCol(bp));
616 } else {
617 // The break was expected.
618 // Check that the {nnn} tag value is correct.
619 int32_t expectedTagVal = t->getExpectedBreak(bp);
620 if (expectedTagVal == -1) {
621 expectedTagVal = 0;
622 }
623 int line = t->getSrcLine(bp);
624 int32_t rs = t->bi->getRuleStatus();
625 if (rs != expectedTagVal) {
626 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
627 " Actual, Expected status = %4d, %4d",
628 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
629 }
630 }
631
632 prevBP = bp;
633 bp = t->bi->previous();
634 }
635
636 // Verify that there were no missed breaks prior to the last one found
637 for (i=prevBP-1; i>=0; i--) {
638 if (t->getExpectedBreak(i) != 0) {
639 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
640 i, t->getSrcLine(i), t->getSrcCol(i));
641 }
642 }
643
644 // Check isBoundary()
645 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
646 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
647 UBool boundaryFound = t->bi->isBoundary(i);
648 if (boundaryExpected != boundaryFound) {
649 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
650 " Expected, Actual= %s, %s",
651 i, t->getSrcLine(i), t->getSrcCol(i),
652 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
653 }
654 }
655
656 // Check following()
657 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
658 int32_t actualBreak = t->bi->following(i);
659 int32_t expectedBreak = BreakIterator::DONE;
660 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
661 if (t->getExpectedBreak(j) != 0) {
662 expectedBreak = j;
663 break;
664 }
665 }
666 if (expectedBreak != actualBreak) {
667 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
668 " Expected, Actual= %d, %d",
669 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
670 }
671 }
672
673 // Check preceding()
674 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
675 int32_t actualBreak = t->bi->preceding(i);
676 int32_t expectedBreak = BreakIterator::DONE;
677
678 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
679 // preceding(trailing byte) will return the index of some preceding code point,
680 // not the lead byte of the current code point, even though that has a smaller index.
681 // Therefore, start looking at the expected break data not at i-1, but at
682 // the start of code point index - 1.
683 utext_setNativeIndex(t->textToBreak, i);
684 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
685 for (; j >= 0; j--) {
686 if (t->getExpectedBreak(j) != 0) {
687 expectedBreak = j;
688 break;
689 }
690 }
691 if (expectedBreak != actualBreak) {
692 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
693 " Expected, Actual= %d, %d",
694 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
695 }
696 }
697 }
698
699
700 void RBBITest::TestExtended() {
701 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
702 // data driven test closely entangles filtered and regular data.
703 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
704 UErrorCode status = U_ZERO_ERROR;
705 Locale locale("");
706
707 TestParams tp(status);
708
709 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
710 if (U_FAILURE(status)) {
711 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
712 }
713
714 //
715 // Open and read the test data file.
716 //
717 const char *testDataDirectory = IntlTest::getSourceTestData(status);
718 CharString testFileName(testDataDirectory, -1, status);
719 testFileName.append("rbbitst.txt", -1, status);
720
721 int len;
722 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
723 if (U_FAILURE(status)) {
724 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
725 return;
726 }
727
728 bool skipTest = false; // Skip this test?
729
730 //
731 // Put the test data into a UnicodeString
732 //
733 UnicodeString testString(FALSE, testFile, len);
734
735 enum EParseState{
736 PARSE_COMMENT,
737 PARSE_TAG,
738 PARSE_DATA,
739 PARSE_NUM,
740 PARSE_RULES
741 }
742 parseState = PARSE_TAG;
743
744 EParseState savedState = PARSE_TAG;
745
746 int32_t lineNum = 1;
747 int32_t colStart = 0;
748 int32_t column = 0;
749 int32_t charIdx = 0;
750
751 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
752
753 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
754 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
755
756 // <rdar://problem/51193810>
757 mach_timebase_info_data_t info;
758 uint64_t start, durationOpen = 0.0, durationUse = 0.0;
759 mach_timebase_info(&info);
760 UBool isLine = FALSE;
761
762 for (charIdx = 0; charIdx < len; ) {
763 status = U_ZERO_ERROR;
764 UChar c = testString.charAt(charIdx);
765 charIdx++;
766 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
767 // treat CRLF as a unit
768 c = u'\n';
769 charIdx++;
770 }
771 if (c == u'\n' || c == u'\r') {
772 lineNum++;
773 colStart = charIdx;
774 }
775 column = charIdx - colStart + 1;
776
777 switch (parseState) {
778 case PARSE_COMMENT:
779 if (c == u'\n' || c == u'\r') {
780 parseState = savedState;
781 }
782 break;
783
784 case PARSE_TAG:
785 {
786 if (c == u'#') {
787 parseState = PARSE_COMMENT;
788 savedState = PARSE_TAG;
789 break;
790 }
791 if (u_isUWhiteSpace(c)) {
792 break;
793 }
794 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
795 delete tp.bi;
796 tp.bi = BreakIterator::createWordInstance(locale, status);
797 skipTest = false;
798 charIdx += 5;
799 isLine = FALSE;
800 break;
801 }
802 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
803 delete tp.bi;
804 tp.bi = BreakIterator::createCharacterInstance(locale, status);
805 skipTest = false;
806 charIdx += 5;
807 isLine = FALSE;
808 break;
809 }
810 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
811 delete tp.bi;
812 start = mach_absolute_time(); // <rdar://problem/51193810>
813 tp.bi = BreakIterator::createLineInstance(locale, status);
814 durationOpen += (((mach_absolute_time() - start) * info.numer)/info.denom);
815 skipTest = false;
816 charIdx += 5;
817 isLine = TRUE;
818 break;
819 }
820 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
821 delete tp.bi;
822 tp.bi = BreakIterator::createSentenceInstance(locale, status);
823 skipTest = false;
824 charIdx += 5;
825 break;
826 }
827 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
828 delete tp.bi;
829 tp.bi = BreakIterator::createTitleInstance(locale, status);
830 charIdx += 6;
831 isLine = FALSE;
832 break;
833 }
834
835 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
836 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
837 charIdx = testString.indexOf(u'>', charIdx) + 1;
838 parseState = PARSE_RULES;
839 rules.remove();
840 rulesFirstLine = lineNum;
841 isLine = FALSE;
842 break;
843 }
844
845 // <locale loc_name>
846 localeMatcher.reset(testString);
847 if (localeMatcher.lookingAt(charIdx-1, status)) {
848 UnicodeString localeName = localeMatcher.group(1, status);
849 char localeName8[100];
850 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
851 locale = Locale::createFromName(localeName8);
852 charIdx += localeMatcher.group(0, status).length() - 1;
853 TEST_ASSERT_SUCCESS(status);
854 break;
855 }
856 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
857 parseState = PARSE_DATA;
858 charIdx += 5;
859 tp.dataToBreak = "";
860 tp.expectedBreaks->removeAllElements();
861 tp.srcCol ->removeAllElements();
862 tp.srcLine->removeAllElements();
863 break;
864 }
865
866 errln("line %d: Tag expected in test file.", lineNum);
867 parseState = PARSE_COMMENT;
868 savedState = PARSE_DATA;
869 goto end_test; // Stop the test.
870 }
871 break;
872
873 case PARSE_RULES:
874 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
875 charIdx += 7;
876 parseState = PARSE_TAG;
877 delete tp.bi;
878 UParseError pe;
879 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
880 skipTest = U_FAILURE(status);
881 if (U_FAILURE(status)) {
882 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
883 rulesFirstLine + pe.line - 1, u_errorName(status));
884 }
885 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
886 charIdx += 10;
887 parseState = PARSE_TAG;
888 UErrorCode ec = U_ZERO_ERROR;
889 UParseError pe;
890 RuleBasedBreakIterator bi(rules, pe, ec);
891 if (U_SUCCESS(ec)) {
892 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
893 rulesFirstLine + pe.line - 1);
894 }
895 } else {
896 rules.append(c);
897 }
898 break;
899
900 case PARSE_DATA:
901 if (c == u'\u2022') { // u'•'
902 int32_t breakIdx = tp.dataToBreak.length();
903 tp.expectedBreaks->setSize(breakIdx+1);
904 tp.expectedBreaks->setElementAt(-1, breakIdx);
905 tp.srcLine->setSize(breakIdx+1);
906 tp.srcLine->setElementAt(lineNum, breakIdx);
907 tp.srcCol ->setSize(breakIdx+1);
908 tp.srcCol ->setElementAt(column, breakIdx);
909 break;
910 }
911
912 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
913 // Add final entry to mappings from break location to source file position.
914 // Need one extra because last break position returned is after the
915 // last char in the data, not at the last char.
916 tp.srcLine->addElement(lineNum, status);
917 tp.srcCol ->addElement(column, status);
918
919 parseState = PARSE_TAG;
920 charIdx += 6;
921
922 if (!skipTest) {
923 // RUN THE TEST!
924 status = U_ZERO_ERROR;
925 tp.setUTF16(status);
926 start = mach_absolute_time(); // <rdar://problem/51193810>
927 executeTest(&tp, status);
928 if (isLine) {
929 durationUse += (((mach_absolute_time() - start) * info.numer)/info.denom);
930 }
931 TEST_ASSERT_SUCCESS(status);
932
933 // Run again, this time with UTF-8 text wrapped in a UText.
934 status = U_ZERO_ERROR;
935 tp.setUTF8(status);
936 TEST_ASSERT_SUCCESS(status);
937 executeTest(&tp, status);
938 }
939 break;
940 }
941
942 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
943 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
944 // Get the code point from the name and insert it into the test data.
945 // (Damn, no API takes names in Unicode !!!
946 // we've got to take it back to char *)
947 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
948 int32_t nameLength = nameEndIdx - (charIdx+2);
949 char charNameBuf[200];
950 UChar32 theChar = -1;
951 if (nameEndIdx != -1) {
952 UErrorCode status = U_ZERO_ERROR;
953 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
954 charNameBuf[sizeof(charNameBuf)-1] = 0;
955 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
956 if (U_FAILURE(status)) {
957 theChar = -1;
958 }
959 }
960 if (theChar == -1) {
961 errln("Error in named character in test file at line %d, col %d",
962 lineNum, column);
963 } else {
964 // Named code point was recognized. Insert it
965 // into the test data.
966 tp.dataToBreak.append(theChar);
967 while (tp.dataToBreak.length() > tp.srcLine->size()) {
968 tp.srcLine->addElement(lineNum, status);
969 tp.srcCol ->addElement(column, status);
970 }
971 }
972 if (nameEndIdx > charIdx) {
973 charIdx = nameEndIdx+1;
974
975 }
976 break;
977 }
978
979
980
981 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
982 charIdx++;
983 int32_t breakIdx = tp.dataToBreak.length();
984 tp.expectedBreaks->setSize(breakIdx+1);
985 tp.expectedBreaks->setElementAt(-1, breakIdx);
986 tp.srcLine->setSize(breakIdx+1);
987 tp.srcLine->setElementAt(lineNum, breakIdx);
988 tp.srcCol ->setSize(breakIdx+1);
989 tp.srcCol ->setElementAt(column, breakIdx);
990 break;
991 }
992
993 if (c == u'<') {
994 tagValue = 0;
995 parseState = PARSE_NUM;
996 break;
997 }
998
999 if (c == u'#' && column==3) { // TODO: why is column off so far?
1000 parseState = PARSE_COMMENT;
1001 savedState = PARSE_DATA;
1002 break;
1003 }
1004
1005 if (c == u'\\') {
1006 // Check for \ at end of line, a line continuation.
1007 // Advance over (discard) the newline
1008 UChar32 cp = testString.char32At(charIdx);
1009 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1010 // We have a CR LF
1011 // Need an extra increment of the input ptr to move over both of them
1012 charIdx++;
1013 }
1014 if (cp == u'\n' || cp == u'\r') {
1015 lineNum++;
1016 colStart = charIdx;
1017 charIdx++;
1018 break;
1019 }
1020
1021 // Let unescape handle the back slash.
1022 cp = testString.unescapeAt(charIdx);
1023 if (cp != -1) {
1024 // Escape sequence was recognized. Insert the char
1025 // into the test data.
1026 tp.dataToBreak.append(cp);
1027 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1028 tp.srcLine->addElement(lineNum, status);
1029 tp.srcCol ->addElement(column, status);
1030 }
1031 break;
1032 }
1033
1034
1035 // Not a recognized backslash escape sequence.
1036 // Take the next char as a literal.
1037 // TODO: Should this be an error?
1038 c = testString.charAt(charIdx);
1039 charIdx = testString.moveIndex32(charIdx, 1);
1040 }
1041
1042 // Normal, non-escaped data char.
1043 tp.dataToBreak.append(c);
1044
1045 // Save the mapping from offset in the data to line/column numbers in
1046 // the original input file. Will be used for better error messages only.
1047 // If there's an expected break before this char, the slot in the mapping
1048 // vector will already be set for this char; don't overwrite it.
1049 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1050 tp.srcLine->addElement(lineNum, status);
1051 tp.srcCol ->addElement(column, status);
1052 }
1053 break;
1054
1055
1056 case PARSE_NUM:
1057 // We are parsing an expected numeric tag value, like <1234>,
1058 // within a chunk of data.
1059 if (u_isUWhiteSpace(c)) {
1060 break;
1061 }
1062
1063 if (c == u'>') {
1064 // Finished the number. Add the info to the expected break data,
1065 // and switch parse state back to doing plain data.
1066 parseState = PARSE_DATA;
1067 if (tagValue == 0) {
1068 tagValue = -1;
1069 }
1070 int32_t breakIdx = tp.dataToBreak.length();
1071 tp.expectedBreaks->setSize(breakIdx+1);
1072 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1073 tp.srcLine->setSize(breakIdx+1);
1074 tp.srcLine->setElementAt(lineNum, breakIdx);
1075 tp.srcCol ->setSize(breakIdx+1);
1076 tp.srcCol ->setElementAt(column, breakIdx);
1077 break;
1078 }
1079
1080 if (u_isdigit(c)) {
1081 tagValue = tagValue*10 + u_charDigitValue(c);
1082 break;
1083 }
1084
1085 errln("Syntax Error in test file at line %d, col %d",
1086 lineNum, column);
1087 parseState = PARSE_COMMENT;
1088 goto end_test; // Stop the test
1089 break;
1090 }
1091
1092
1093 if (U_FAILURE(status)) {
1094 dataerrln("ICU Error %s while parsing test file at line %d.",
1095 u_errorName(status), lineNum);
1096 status = U_ZERO_ERROR;
1097 goto end_test; // Stop the test
1098 }
1099
1100 }
1101
1102 // Reached end of test file. Raise an error if parseState indicates that we are
1103 // within a block that should have been terminated.
1104
1105 if (parseState == PARSE_RULES) {
1106 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1107 lineNum, rulesFirstLine);
1108 }
1109 if (parseState == PARSE_DATA) {
1110 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1111 }
1112
1113 //
1114 infoln("TestExtended total time in createLineInstance (nsec):\t%llu\n", durationOpen);
1115 infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse);
1116
1117
1118 end_test:
1119 delete [] testFile;
1120 #endif
1121 }
1122
1123
1124 //-------------------------------------------------------------------------------
1125 //
1126 // TestDictRules create a break iterator from source rules that includes a
1127 // dictionary range. Regression for bug #7130. Source rules
1128 // do not declare a break iterator type (word, line, sentence, etc.),
1129 // but the dictionary code, without a type, would loop.
1130 //
1131 //-------------------------------------------------------------------------------
1132 void RBBITest::TestDictRules() {
1133 const char *rules = "$dictionary = [a-z]; \n"
1134 "!!forward; \n"
1135 "$dictionary $dictionary; \n"
1136 "!!reverse; \n"
1137 "$dictionary $dictionary; \n";
1138 const char *text = "aa";
1139 UErrorCode status = U_ZERO_ERROR;
1140 UParseError parseError;
1141
1142 RuleBasedBreakIterator bi(rules, parseError, status);
1143 if (U_SUCCESS(status)) {
1144 UnicodeString utext = text;
1145 bi.setText(utext);
1146 int32_t position;
1147 int32_t loops;
1148 for (loops = 0; loops<10; loops++) {
1149 position = bi.next();
1150 if (position == RuleBasedBreakIterator::DONE) {
1151 break;
1152 }
1153 }
1154 TEST_ASSERT(loops == 1);
1155 } else {
1156 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1157 }
1158 }
1159
1160
1161
1162 //-------------------------------------------------------------------------------
1163 //
1164 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1165 // return the data in one big UChar * buffer, which the caller must delete.
1166 //
1167 // parameters:
1168 // fileName: the name of the file, with no directory part. The test data directory
1169 // is assumed.
1170 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1171 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1172 // specified here. The BOM, if it exists, will be stripped from the returned data.
1173 // Pass NULL for the system default encoding.
1174 // status
1175 // returns:
1176 // The file data, converted to UChar.
1177 // The caller must delete this when done with
1178 // delete [] theBuffer;
1179 //
1180 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1181 // Move this function to some common place.
1182 //
1183 //--------------------------------------------------------------------------------
1184 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1185 UChar *retPtr = NULL;
1186 char *fileBuf = NULL;
1187 UConverter* conv = NULL;
1188 FILE *f = NULL;
1189
1190 ulen = 0;
1191 if (U_FAILURE(status)) {
1192 return retPtr;
1193 }
1194
1195 //
1196 // Open the file.
1197 //
1198 f = fopen(fileName, "rb");
1199 if (f == 0) {
1200 dataerrln("Error opening test data file %s\n", fileName);
1201 status = U_FILE_ACCESS_ERROR;
1202 return NULL;
1203 }
1204 //
1205 // Read it in
1206 //
1207 int fileSize;
1208 int amt_read;
1209
1210 fseek( f, 0, SEEK_END);
1211 fileSize = ftell(f);
1212 fileBuf = new char[fileSize];
1213 fseek(f, 0, SEEK_SET);
1214 amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1215 if (amt_read != fileSize || fileSize <= 0) {
1216 errln("Error reading test data file.");
1217 goto cleanUpAndReturn;
1218 }
1219
1220 //
1221 // Look for a Unicode Signature (BOM) on the data just read
1222 //
1223 int32_t signatureLength;
1224 const char * fileBufC;
1225 const char* bomEncoding;
1226
1227 fileBufC = fileBuf;
1228 bomEncoding = ucnv_detectUnicodeSignature(
1229 fileBuf, fileSize, &signatureLength, &status);
1230 if(bomEncoding!=NULL ){
1231 fileBufC += signatureLength;
1232 fileSize -= signatureLength;
1233 encoding = bomEncoding;
1234 }
1235
1236 //
1237 // Open a converter to take the rule file to UTF-16
1238 //
1239 conv = ucnv_open(encoding, &status);
1240 if (U_FAILURE(status)) {
1241 goto cleanUpAndReturn;
1242 }
1243
1244 //
1245 // Convert the rules to UChar.
1246 // Preflight first to determine required buffer size.
1247 //
1248 ulen = ucnv_toUChars(conv,
1249 NULL, // dest,
1250 0, // destCapacity,
1251 fileBufC,
1252 fileSize,
1253 &status);
1254 if (status == U_BUFFER_OVERFLOW_ERROR) {
1255 // Buffer Overflow is expected from the preflight operation.
1256 status = U_ZERO_ERROR;
1257
1258 retPtr = new UChar[ulen+1];
1259 ucnv_toUChars(conv,
1260 retPtr, // dest,
1261 ulen+1,
1262 fileBufC,
1263 fileSize,
1264 &status);
1265 }
1266
1267 cleanUpAndReturn:
1268 fclose(f);
1269 delete []fileBuf;
1270 ucnv_close(conv);
1271 if (U_FAILURE(status)) {
1272 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1273 delete []retPtr;
1274 retPtr = 0;
1275 ulen = 0;
1276 }
1277 return retPtr;
1278 }
1279
1280
1281
1282 //--------------------------------------------------------------------------------------------
1283 //
1284 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1285 //
1286 //-------------------------------------------------------------------------------------------
1287 void RBBITest::TestUnicodeFiles() {
1288 RuleBasedBreakIterator *bi;
1289 UErrorCode status = U_ZERO_ERROR;
1290
1291 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1292 TEST_ASSERT_SUCCESS(status);
1293 if (U_SUCCESS(status)) {
1294 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1295 }
1296 delete bi;
1297
1298 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1299 TEST_ASSERT_SUCCESS(status);
1300 if (U_SUCCESS(status)) {
1301 runUnicodeTestData("WordBreakTest.txt", bi);
1302 }
1303 delete bi;
1304
1305 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1306 TEST_ASSERT_SUCCESS(status);
1307 if (U_SUCCESS(status)) {
1308 runUnicodeTestData("SentenceBreakTest.txt", bi);
1309 }
1310 delete bi;
1311
1312 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1313 TEST_ASSERT_SUCCESS(status);
1314 if (U_SUCCESS(status)) {
1315 runUnicodeTestData("LineBreakTest.txt", bi);
1316 }
1317 delete bi;
1318 }
1319
1320
1321 // Check for test cases from the Unicode test data files that are known to fail
1322 // and should be skipped as known issues because ICU does not fully implement
1323 // the Unicode specifications, or because ICU includes tailorings that differ from
1324 // the Unicode standard.
1325 //
1326 // Test cases are identified by the test data sequence, which tends to be more stable
1327 // across Unicode versions than the test file line numbers.
1328 //
1329 // The test case with ticket "10666" is a dummy, included as an example.
1330
1331 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1332 static struct TestCase {
1333 const char *fTicketNum;
1334 const char *fFileName;
1335 const UChar *fString;
1336 } badTestCases[] = {
1337 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1338 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1339 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1340 // ICU is out of sync with Unicode.
1341 {"8151", "LineBreakTest.txt", u"-#"},
1342 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1343 {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
1344 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1345 {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
1346 {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1347 {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
1348 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1349
1350 // Issue ICU-12017 Improve line break around numbers
1351 {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1352 {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1353 {"12017", "LineBreakTest.txt", u"find .com"},
1354 {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1355 {"12017", "LineBreakTest.txt", u"a.2 "},
1356 {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1357 {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1358 {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1359 {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1360 {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1361 {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1362 {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1363 {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1364 {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1365 {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1366 };
1367
1368 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1369 const TestCase &badCase = badTestCases[n];
1370 if (!strcmp(fileName, badCase.fFileName) &&
1371 testCase == UnicodeString(badCase.fString)) {
1372 return logKnownIssue(badCase.fTicketNum);
1373 }
1374 }
1375 return FALSE;
1376 }
1377
1378
1379 //--------------------------------------------------------------------------------------------
1380 //
1381 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1382 //
1383 //-------------------------------------------------------------------------------------------
1384 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1386 UErrorCode status = U_ZERO_ERROR;
1387
1388 //
1389 // Open and read the test data file, put it into a UnicodeString.
1390 //
1391 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1392 char testFileName[1000];
1393 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1394 dataerrln("Can't open test data. Path too long.");
1395 return;
1396 }
1397 strcpy(testFileName, testDataDirectory);
1398 strcat(testFileName, fileName);
1399
1400 logln("Opening data file %s\n", fileName);
1401
1402 int len;
1403 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1404 if (status != U_FILE_ACCESS_ERROR) {
1405 TEST_ASSERT_SUCCESS(status);
1406 TEST_ASSERT(testFile != NULL);
1407 }
1408 if (U_FAILURE(status) || testFile == NULL) {
1409 return; /* something went wrong, error already output */
1410 }
1411 UnicodeString testFileAsString(TRUE, testFile, len);
1412
1413 //
1414 // Parse the test data file using a regular expression.
1415 // Each kind of token is recognized in its own capture group; what type of item was scanned
1416 // is identified by which group had a match.
1417 //
1418 // Caputure Group # 1 2 3 4 5
1419 // Parses this item: divide x hex digits comment \n unrecognized \n
1420 //
1421 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1422 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1423 UnicodeString testString;
1424 UVector32 breakPositions(status);
1425 int lineNumber = 1;
1426 TEST_ASSERT_SUCCESS(status);
1427 if (U_FAILURE(status)) {
1428 return;
1429 }
1430
1431 //
1432 // Scan through each test case, building up the string to be broken in testString,
1433 // and the positions that should be boundaries in the breakPositions vector.
1434 //
1435 int spin = 0;
1436 while (tokenMatcher.find()) {
1437 if(tokenMatcher.hitEnd()) {
1438 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1439 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1440 and caused an infinite loop here on EBCDIC systems!
1441 */
1442 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1443 // return;
1444 }
1445 if (tokenMatcher.start(1, status) >= 0) {
1446 // Scanned a divide sign, indicating a break position in the test data.
1447 if (testString.length()>0) {
1448 breakPositions.addElement(testString.length(), status);
1449 }
1450 }
1451 else if (tokenMatcher.start(2, status) >= 0) {
1452 // Scanned an 'x', meaning no break at this position in the test data
1453 // Nothing to be done here.
1454 }
1455 else if (tokenMatcher.start(3, status) >= 0) {
1456 // Scanned Hex digits. Convert them to binary, append to the character data string.
1457 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1458 int length = hexNumber.length();
1459 if (length<=8) {
1460 char buf[10];
1461 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1462 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1463 if (c<=0x10ffff) {
1464 testString.append(c);
1465 } else {
1466 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1467 fileName, lineNumber);
1468 }
1469 } else {
1470 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1471 fileName, lineNumber);
1472 }
1473 }
1474 else if (tokenMatcher.start(4, status) >= 0) {
1475 // Scanned to end of a line, possibly skipping over a comment in the process.
1476 // If the line from the file contained test data, run the test now.
1477 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1478 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1479 }
1480
1481 // Clear out this test case.
1482 // The string and breakPositions vector will be refilled as the next
1483 // test case is parsed.
1484 testString.remove();
1485 breakPositions.removeAllElements();
1486 lineNumber++;
1487 } else {
1488 // Scanner catchall. Something unrecognized appeared on the line.
1489 char token[16];
1490 UnicodeString uToken = tokenMatcher.group(0, status);
1491 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1492 token[sizeof(token)-1] = 0;
1493 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1494
1495 // Clean up, in preparation for continuing with the next line.
1496 testString.remove();
1497 breakPositions.removeAllElements();
1498 lineNumber++;
1499 }
1500 TEST_ASSERT_SUCCESS(status);
1501 if (U_FAILURE(status)) {
1502 break;
1503 }
1504 }
1505
1506 delete [] testFile;
1507 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1508 }
1509
1510 //--------------------------------------------------------------------------------------------
1511 //
1512 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1513 // test data files. Do only a simple, forward-only check -
1514 // this test is mostly to check that ICU and the Unicode
1515 // data agree with each other.
1516 //
1517 //--------------------------------------------------------------------------------------------
1518 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1519 const UnicodeString &testString, // Text data to be broken
1520 UVector32 *breakPositions, // Positions where breaks should be found.
1521 RuleBasedBreakIterator *bi) {
1522 int32_t pos; // Break Position in the test string
1523 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1524 int32_t expectedPos; // Expected break position (index into test string)
1525
1526 bi->setText(testString);
1527 pos = bi->first();
1528 pos = bi->next();
1529
1530 while (pos != BreakIterator::DONE) {
1531 if (expectedI >= breakPositions->size()) {
1532 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1533 testFileName, lineNumber, pos);
1534 break;
1535 }
1536 expectedPos = breakPositions->elementAti(expectedI);
1537 if (pos < expectedPos) {
1538 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1539 testFileName, lineNumber, pos);
1540 break;
1541 }
1542 if (pos > expectedPos) {
1543 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1544 testFileName, lineNumber, expectedPos);
1545 break;
1546 }
1547 pos = bi->next();
1548 expectedI++;
1549 }
1550
1551 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1552 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1553 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1554 }
1555 }
1556
1557
1558
1559 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1560 //---------------------------------------------------------------------------------------
1561 //
1562 // class RBBIMonkeyKind
1563 //
1564 // Monkey Test for Break Iteration
1565 // Abstract interface class. Concrete derived classes independently
1566 // implement the break rules for different iterator types.
1567 //
1568 // The Monkey Test itself doesn't know which type of break iterator it is
1569 // testing, but works purely in terms of the interface defined here.
1570 //
1571 //---------------------------------------------------------------------------------------
1572 class RBBIMonkeyKind {
1573 public:
1574 // Return a UVector of UnicodeSets, representing the character classes used
1575 // for this type of iterator.
1576 virtual UVector *charClasses() = 0;
1577
1578 // Set the test text on which subsequent calls to next() will operate
1579 virtual void setText(const UnicodeString &s) = 0;
1580
1581 // Find the next break postion, starting from the prev break position, or from zero.
1582 // Return -1 after reaching end of string.
1583 virtual int32_t next(int32_t i) = 0;
1584
1585 // Name of each character class, parallel with charClasses. Used for debugging output
1586 // of characters.
1587 virtual std::vector<std::string>& characterClassNames();
1588
1589 void setAppliedRule(int32_t position, const char* value);
1590
1591 std::string getAppliedRule(int32_t position);
1592
1593 virtual ~RBBIMonkeyKind();
1594 UErrorCode deferredStatus;
1595
1596 std::string classNameFromCodepoint(const UChar32 c);
1597 unsigned int maxClassNameSize();
1598
1599 protected:
1600 RBBIMonkeyKind();
1601 std::vector<std::string> classNames;
1602 std::vector<std::string> appliedRules;
1603
1604 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1605 void prepareAppliedRules(int32_t size );
1606
1607 private:
1608
1609 };
1610
1611 RBBIMonkeyKind::RBBIMonkeyKind() {
1612 deferredStatus = U_ZERO_ERROR;
1613 }
1614
1615 RBBIMonkeyKind::~RBBIMonkeyKind() {
1616 }
1617
1618 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1619 return classNames;
1620 }
1621
1622 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1623 // Remove all the information in the `appliedRules`.
1624 appliedRules.clear();
1625 appliedRules.resize(size + 1);
1626 }
1627
1628 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1629 appliedRules[position] = value;
1630 }
1631
1632 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1633 return appliedRules[position];
1634 }
1635
1636 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1637 // Simply iterate through charClasses to find character's class
1638 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1639 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1640 if (classSet->contains(c)) {
1641 return classNames[aClassNum];
1642 }
1643 }
1644 U_ASSERT(FALSE); // This should not happen.
1645 return "bad class name";
1646 }
1647
1648 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1649 unsigned int maxSize = 0;
1650 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1651 if (classNames[aClassNum].size() > maxSize) {
1652 maxSize = classNames[aClassNum].size();
1653 }
1654 }
1655 return maxSize;
1656 }
1657
1658 //----------------------------------------------------------------------------------------
1659 //
1660 // Random Numbers. Similar to standard lib rand() and srand()
1661 // Not using library to
1662 // 1. Get same results on all platforms.
1663 // 2. Get access to current seed, to more easily reproduce failures.
1664 //
1665 //---------------------------------------------------------------------------------------
// Current state of the generator. Left accessible at file scope so that a
// failing run can be reproduced from its seed.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    // Bits 16..30 of the state: same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
1673
1674
1675 //------------------------------------------------------------------------------------------
1676 //
1677 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1678 // of RBBIMonkeyKind.
1679 //
1680 //------------------------------------------------------------------------------------------
1681 class RBBICharMonkey: public RBBIMonkeyKind {
1682 public:
1683 RBBICharMonkey();
1684 virtual ~RBBICharMonkey();
1685 virtual UVector *charClasses();
1686 virtual void setText(const UnicodeString &s);
1687 virtual int32_t next(int32_t i);
1688 private:
1689 UVector *fSets;
1690
1691 UnicodeSet *fCRLFSet;
1692 UnicodeSet *fControlSet;
1693 UnicodeSet *fExtendSet;
1694 UnicodeSet *fZWJSet;
1695 UnicodeSet *fRegionalIndicatorSet;
1696 UnicodeSet *fPrependSet;
1697 UnicodeSet *fSpacingSet;
1698 UnicodeSet *fLSet;
1699 UnicodeSet *fVSet;
1700 UnicodeSet *fTSet;
1701 UnicodeSet *fLVSet;
1702 UnicodeSet *fLVTSet;
1703 UnicodeSet *fHangulSet;
1704 UnicodeSet *fExtendedPictSet;
1705 UnicodeSet *fViramaSet;
1706 UnicodeSet *fLinkingConsonantSet;
1707 UnicodeSet *fExtCccZwjSet;
1708 UnicodeSet *fAnySet;
1709
1710 const UnicodeString *fText;
1711 };
1712
1713
1714 RBBICharMonkey::RBBICharMonkey() {
1715 UErrorCode status = U_ZERO_ERROR;
1716
1717 fText = NULL;
1718
1719 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1720 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1721 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1722 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1723 fRegionalIndicatorSet =
1724 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1725 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1726 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1727 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1728 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1729 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1730 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1731 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1732 fHangulSet = new UnicodeSet();
1733 fHangulSet->addAll(*fLSet);
1734 fHangulSet->addAll(*fVSet);
1735 fHangulSet->addAll(*fTSet);
1736 fHangulSet->addAll(*fLVSet);
1737 fHangulSet->addAll(*fLVTSet);
1738
1739 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1740 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1741 "\\p{Indic_Syllabic_Category=Virama}]", status);
1742 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1743 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1744 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1745 fAnySet = new UnicodeSet(0, 0x10ffff);
1746
1747 // Create sets of characters, and add the names of the above character sets.
1748 // In each new ICU release, add new names corresponding to the sets above.
1749 fSets = new UVector(status);
1750
1751 // Important: Keep class names the same as the class contents.
1752 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1753 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1754 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1755 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1756 if (!fPrependSet->isEmpty()) {
1757 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1758 }
1759 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1760 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1761 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1762 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1763 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1764 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1765 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1766 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1767
1768 if (U_FAILURE(status)) {
1769 deferredStatus = status;
1770 }
1771 }
1772
1773
1774 void RBBICharMonkey::setText(const UnicodeString &s) {
1775 fText = &s;
1776 prepareAppliedRules(s.length());
1777 }
1778
1779
1780
1781 int32_t RBBICharMonkey::next(int32_t prevPos) {
1782 int p0, p1, p2, p3; // Indices of the significant code points around the
1783 // break position being tested. The candidate break
1784 // location is before p2.
1785
1786 int breakPos = -1;
1787
1788 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1789 UChar32 cBase; // for (X Extend*) patterns, the X character.
1790
1791 if (U_FAILURE(deferredStatus)) {
1792 return -1;
1793 }
1794
1795 // Previous break at end of string. return DONE.
1796 if (prevPos >= fText->length()) {
1797 return -1;
1798 }
1799
1800 p0 = p1 = p2 = p3 = prevPos;
1801 c3 = fText->char32At(prevPos);
1802 c0 = c1 = c2 = cBase = 0;
1803 (void)p0; // suppress set but not used warning.
1804 (void)c0;
1805
1806 // Loop runs once per "significant" character position in the input text.
1807 for (;;) {
1808 // Move all of the positions forward in the input string.
1809 p0 = p1; c0 = c1;
1810 p1 = p2; c1 = c2;
1811 p2 = p3; c2 = c3;
1812
1813 // Advance p3 by one codepoint
1814 p3 = fText->moveIndex32(p3, 1);
1815 c3 = fText->char32At(p3);
1816
1817 if (p1 == p2) {
1818 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1819 continue;
1820 }
1821
1822 if (p2 == fText->length()) {
1823 setAppliedRule(p2, "End of String");
1824 break;
1825 }
1826
1827 // No Extend or Format characters may appear between the CR and LF,
1828 // which requires the additional check for p2 immediately following p1.
1829 //
1830 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1831 setAppliedRule(p2, "GB3 CR x LF");
1832 continue;
1833 }
1834
1835 if (fControlSet->contains(c1) ||
1836 c1 == 0x0D ||
1837 c1 == 0x0A) {
1838 setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
1839 break;
1840 }
1841
1842 if (fControlSet->contains(c2) ||
1843 c2 == 0x0D ||
1844 c2 == 0x0A) {
1845 setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
1846 break;
1847 }
1848
1849 if (fLSet->contains(c1) &&
1850 (fLSet->contains(c2) ||
1851 fVSet->contains(c2) ||
1852 fLVSet->contains(c2) ||
1853 fLVTSet->contains(c2))) {
1854 setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
1855 continue;
1856 }
1857
1858 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1859 (fVSet->contains(c2) || fTSet->contains(c2))) {
1860 setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
1861 continue;
1862 }
1863
1864 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1865 fTSet->contains(c2)) {
1866 setAppliedRule(p2, "GB8 ( LVT | T) x T");
1867 continue;
1868 }
1869
1870 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1871 if (!fExtendSet->contains(c1)) {
1872 cBase = c1;
1873 }
1874 setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
1875 continue;
1876 }
1877
1878 if (fSpacingSet->contains(c2)) {
1879 setAppliedRule(p2, "GB9a x SpacingMark");
1880 continue;
1881 }
1882
1883 if (fPrependSet->contains(c1)) {
1884 setAppliedRule(p2, "GB9b Prepend x");
1885 continue;
1886 }
1887
1888 // Note: Viramas are also included in the ExtCccZwj class.
1889 if (fLinkingConsonantSet->contains(c2)) {
1890 int pi = p1;
1891 bool sawVirama = false;
1892 while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1893 if (fViramaSet->contains(fText->char32At(pi))) {
1894 sawVirama = true;
1895 }
1896 pi = fText->moveIndex32(pi, -1);
1897 }
1898 if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1899 setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1900 continue;
1901 }
1902 }
1903
1904 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1905 setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1906 continue;
1907 }
1908
1909 // Note: The first if condition is a little tricky. We only need to force
1910 // a break if there are three or more contiguous RIs. If there are
1911 // only two, a break following will occur via other rules, and will include
1912 // any trailing extend characters, which is needed behavior.
1913 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1914 && fRegionalIndicatorSet->contains(c2)) {
1915 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1916 break;
1917 }
1918 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1919 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1920 continue;
1921 }
1922
1923 setAppliedRule(p2, "GB999 Any <break> Any");
1924 break;
1925 }
1926
1927 breakPos = p2;
1928 return breakPos;
1929 }
1930
1931
1932
1933 UVector *RBBICharMonkey::charClasses() {
1934 return fSets;
1935 }
1936
1937 RBBICharMonkey::~RBBICharMonkey() {
1938 delete fSets;
1939 delete fCRLFSet;
1940 delete fControlSet;
1941 delete fExtendSet;
1942 delete fRegionalIndicatorSet;
1943 delete fPrependSet;
1944 delete fSpacingSet;
1945 delete fLSet;
1946 delete fVSet;
1947 delete fTSet;
1948 delete fLVSet;
1949 delete fLVTSet;
1950 delete fHangulSet;
1951 delete fAnySet;
1952 delete fZWJSet;
1953 delete fExtendedPictSet;
1954 delete fViramaSet;
1955 delete fLinkingConsonantSet;
1956 delete fExtCccZwjSet;
1957 }
1958
1959 //------------------------------------------------------------------------------------------
1960 //
1961 // class RBBIWordMonkey Word Break specific implementation
1962 // of RBBIMonkeyKind.
1963 //
1964 //------------------------------------------------------------------------------------------
1965 class RBBIWordMonkey: public RBBIMonkeyKind {
1966 public:
1967 RBBIWordMonkey();
1968 virtual ~RBBIWordMonkey();
1969 virtual UVector *charClasses();
1970 virtual void setText(const UnicodeString &s);
1971 virtual int32_t next(int32_t i);
1972 private:
1973 UVector *fSets;
1974
1975 UnicodeSet *fCRSet;
1976 UnicodeSet *fLFSet;
1977 UnicodeSet *fNewlineSet;
1978 UnicodeSet *fRegionalIndicatorSet;
1979 UnicodeSet *fKatakanaSet;
1980 UnicodeSet *fHebrew_LetterSet;
1981 UnicodeSet *fALetterSet;
1982 UnicodeSet *fSingle_QuoteSet;
1983 UnicodeSet *fDouble_QuoteSet;
1984 UnicodeSet *fMidNumLetSet;
1985 UnicodeSet *fMidLetterSet;
1986 UnicodeSet *fMidNumSet;
1987 UnicodeSet *fNumericSet;
1988 UnicodeSet *fFormatSet;
1989 UnicodeSet *fOtherSet;
1990 UnicodeSet *fExtendSet;
1991 UnicodeSet *fExtendNumLetSet;
1992 UnicodeSet *fWSegSpaceSet;
1993 UnicodeSet *fDictionarySet;
1994 UnicodeSet *fZWJSet;
1995 UnicodeSet *fExtendedPictSet;
1996
1997 const UnicodeString *fText;
1998 };
1999
2000
2001 RBBIWordMonkey::RBBIWordMonkey()
2002 {
2003 UErrorCode status = U_ZERO_ERROR;
2004
2005 fSets = new UVector(status);
2006
2007 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
2008 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
2009 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
2010 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
2011 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2012 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2013 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2014 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
2015 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
2016 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
2017 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
2018 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
2019 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2020 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
2021 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2022 // There are some sc=Hani characters with WB=Extend.
2023 // The break rules need to pick one or the other because
2024 // Extend overlapping with something else is messy.
2025 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2026 // in $Han (for $dictionary) and out of $Extend.
2027 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2028 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
2029
2030 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
2031 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2032
2033 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2034 fDictionarySet->addAll(*fKatakanaSet);
2035 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2036
2037 fALetterSet->removeAll(*fDictionarySet);
2038
2039 fOtherSet = new UnicodeSet();
2040 if(U_FAILURE(status)) {
2041 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2042 deferredStatus = status;
2043 return;
2044 }
2045
2046 fOtherSet->complement();
2047 fOtherSet->removeAll(*fCRSet);
2048 fOtherSet->removeAll(*fLFSet);
2049 fOtherSet->removeAll(*fNewlineSet);
2050 fOtherSet->removeAll(*fKatakanaSet);
2051 fOtherSet->removeAll(*fHebrew_LetterSet);
2052 fOtherSet->removeAll(*fALetterSet);
2053 fOtherSet->removeAll(*fSingle_QuoteSet);
2054 fOtherSet->removeAll(*fDouble_QuoteSet);
2055 fOtherSet->removeAll(*fMidLetterSet);
2056 fOtherSet->removeAll(*fMidNumSet);
2057 fOtherSet->removeAll(*fNumericSet);
2058 fOtherSet->removeAll(*fExtendNumLetSet);
2059 fOtherSet->removeAll(*fWSegSpaceSet);
2060 fOtherSet->removeAll(*fFormatSet);
2061 fOtherSet->removeAll(*fExtendSet);
2062 fOtherSet->removeAll(*fRegionalIndicatorSet);
2063 fOtherSet->removeAll(*fZWJSet);
2064 fOtherSet->removeAll(*fExtendedPictSet);
2065
2066 // Inhibit dictionary characters from being tested at all.
2067 fOtherSet->removeAll(*fDictionarySet);
2068
2069 // Add classes and their names
2070 fSets->addElement(fCRSet, status); classNames.push_back("CR");
2071 fSets->addElement(fLFSet, status); classNames.push_back("LF");
2072 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2073 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2074 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2075 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2076 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2077 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2078 // Omit Katakana from fSets, which omits Katakana characters
2079 // from the test data. They are all in the dictionary set,
2080 // which this (old, to be retired) monkey test cannot handle.
2081 //fSets->addElement(fKatakanaSet, status);
2082
2083 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2084 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2085 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2086 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2087 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2088 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2089 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2090 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2091 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2092
2093 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2094 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2095
2096 if (U_FAILURE(status)) {
2097 deferredStatus = status;
2098 }
2099 }
2100
2101 void RBBIWordMonkey::setText(const UnicodeString &s) {
2102 fText = &s;
2103 prepareAppliedRules(s.length());
2104 }
2105
2106
2107 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2108 int p0, p1, p2, p3; // Indices of the significant code points around the
2109 // break position being tested. The candidate break
2110 // location is before p2.
2111
2112 int breakPos = -1;
2113
2114 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2115
2116 if (U_FAILURE(deferredStatus)) {
2117 return -1;
2118 }
2119
2120 // Prev break at end of string. return DONE.
2121 if (prevPos >= fText->length()) {
2122 return -1;
2123 }
2124 p0 = p1 = p2 = p3 = prevPos;
2125 c3 = fText->char32At(prevPos);
2126 c0 = c1 = c2 = 0;
2127 (void)p0; // Suppress set but not used warning.
2128
2129 // Loop runs once per "significant" character position in the input text.
2130 for (;;) {
2131 // Move all of the positions forward in the input string.
2132 p0 = p1; c0 = c1;
2133 p1 = p2; c1 = c2;
2134 p2 = p3; c2 = c3;
2135
2136 // Advance p3 by X(Extend | Format)* Rule 4
2137 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2138 do {
2139 p3 = fText->moveIndex32(p3, 1);
2140 c3 = fText->char32At(p3);
2141 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2142 break;
2143 }
2144 }
2145 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2146
2147
2148 if (p1 == p2) {
2149 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2150 continue;
2151 }
2152
2153 if (p2 == fText->length()) {
2154 // Reached end of string. Always a break position.
2155 break;
2156 }
2157
2158 // No Extend or Format characters may appear between the CR and LF,
2159 // which requires the additional check for p2 immediately following p1.
2160 //
2161 if (c1==0x0D && c2==0x0A) {
2162 setAppliedRule(p2, "WB3 CR x LF");
2163 continue;
2164 }
2165
2166 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2167 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2168 break;
2169 }
2170 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2171 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2172 break;
2173 }
2174
2175 // Not ignoring extend chars, so peek into input text to
2176 // get the potential ZWJ, the character immediately preceding c2.
2177 // Sloppy UChar32 indexing: p2-1 may reference trail half
2178 // but char32At will get the full code point.
2179 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2180 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2181 continue;
2182 }
2183
2184 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2185 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2186 continue;
2187 }
2188
2189 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2190 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2191 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2192 continue;
2193 }
2194
2195 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2196 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2197 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2198 setAppliedRule(p2,
2199 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2200 continue;
2201 }
2202
2203 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2204 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2205 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2206 setAppliedRule(p2,
2207 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2208 continue;
2209 }
2210
2211 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2212 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2213 continue;
2214 }
2215
2216 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2217 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2218 continue;
2219 }
2220
2221 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2222 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2223 continue;
2224 }
2225
2226 if (fNumericSet->contains(c1) &&
2227 fNumericSet->contains(c2)) {
2228 setAppliedRule(p2, "WB8 Numeric x Numeric");
2229 continue;
2230 }
2231
2232 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2233 fNumericSet->contains(c2)) {
2234 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2235 continue;
2236 }
2237
2238 if (fNumericSet->contains(c1) &&
2239 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2240 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2241 continue;
2242 }
2243
2244 if (fNumericSet->contains(c0) &&
2245 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2246 fNumericSet->contains(c2)) {
2247 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2248 continue;
2249 }
2250
2251 if (fNumericSet->contains(c1) &&
2252 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2253 fNumericSet->contains(c3)) {
2254 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2255 continue;
2256 }
2257
2258 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2259 // all Katakana are handled by the dictionary breaker.
2260 if (fKatakanaSet->contains(c1) &&
2261 fKatakanaSet->contains(c2)) {
2262 setAppliedRule(p2, "WB13 Katakana x Katakana");
2263 continue;
2264 }
2265
2266 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2267 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2268 fExtendNumLetSet->contains(c2)) {
2269 setAppliedRule(p2,
2270 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2271 continue;
2272 }
2273
2274 if (fExtendNumLetSet->contains(c1) &&
2275 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2276 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2277 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2278 continue;
2279 }
2280
2281 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2282 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2283 break;
2284 }
2285 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2286 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2287 continue;
2288 }
2289
2290 setAppliedRule(p2, "WB999");
2291 break;
2292 }
2293
2294 breakPos = p2;
2295 return breakPos;
2296 }
2297
2298
2299 UVector *RBBIWordMonkey::charClasses() {
2300 return fSets;
2301 }
2302
2303 RBBIWordMonkey::~RBBIWordMonkey() {
2304 delete fSets;
2305 delete fCRSet;
2306 delete fLFSet;
2307 delete fNewlineSet;
2308 delete fKatakanaSet;
2309 delete fHebrew_LetterSet;
2310 delete fALetterSet;
2311 delete fSingle_QuoteSet;
2312 delete fDouble_QuoteSet;
2313 delete fMidNumLetSet;
2314 delete fMidLetterSet;
2315 delete fMidNumSet;
2316 delete fNumericSet;
2317 delete fFormatSet;
2318 delete fExtendSet;
2319 delete fExtendNumLetSet;
2320 delete fWSegSpaceSet;
2321 delete fRegionalIndicatorSet;
2322 delete fDictionarySet;
2323 delete fOtherSet;
2324 delete fZWJSet;
2325 delete fExtendedPictSet;
2326 }
2327
2328
2329
2330
2331 //------------------------------------------------------------------------------------------
2332 //
2333 // class RBBISentMonkey Sentence Break specific implementation
2334 // of RBBIMonkeyKind.
2335 //
2336 //------------------------------------------------------------------------------------------
2337 class RBBISentMonkey: public RBBIMonkeyKind {
2338 public:
2339 RBBISentMonkey();
2340 virtual ~RBBISentMonkey();
2341 virtual UVector *charClasses();
2342 virtual void setText(const UnicodeString &s);
2343 virtual int32_t next(int32_t i);
2344 private:
2345 int moveBack(int posFrom);
2346 int moveForward(int posFrom);
2347 UChar32 cAt(int pos);
2348
2349 UVector *fSets;
2350
2351 UnicodeSet *fSepSet;
2352 UnicodeSet *fFormatSet;
2353 UnicodeSet *fSpSet;
2354 UnicodeSet *fLowerSet;
2355 UnicodeSet *fUpperSet;
2356 UnicodeSet *fOLetterSet;
2357 UnicodeSet *fNumericSet;
2358 UnicodeSet *fATermSet;
2359 UnicodeSet *fSContinueSet;
2360 UnicodeSet *fSTermSet;
2361 UnicodeSet *fCloseSet;
2362 UnicodeSet *fOtherSet;
2363 UnicodeSet *fExtendSet;
2364
2365 const UnicodeString *fText;
2366 };
2367
2368 RBBISentMonkey::RBBISentMonkey()
2369 {
2370 UErrorCode status = U_ZERO_ERROR;
2371
2372 fSets = new UVector(status);
2373
2374 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2375 // set and made into character classes of their own. For the monkey impl,
2376 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2377 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2378 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2379 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2380 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2381 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2382 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2383 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2384 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2385 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2386 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2387 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2388 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2389 fOtherSet = new UnicodeSet();
2390
2391 if(U_FAILURE(status)) {
2392 deferredStatus = status;
2393 return;
2394 }
2395
2396 fOtherSet->complement();
2397 fOtherSet->removeAll(*fSepSet);
2398 fOtherSet->removeAll(*fFormatSet);
2399 fOtherSet->removeAll(*fSpSet);
2400 fOtherSet->removeAll(*fLowerSet);
2401 fOtherSet->removeAll(*fUpperSet);
2402 fOtherSet->removeAll(*fOLetterSet);
2403 fOtherSet->removeAll(*fNumericSet);
2404 fOtherSet->removeAll(*fATermSet);
2405 fOtherSet->removeAll(*fSContinueSet);
2406 fOtherSet->removeAll(*fSTermSet);
2407 fOtherSet->removeAll(*fCloseSet);
2408 fOtherSet->removeAll(*fExtendSet);
2409
2410 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2411 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2412 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2413 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2414 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2415 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2416 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2417 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2418 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2419 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2420 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2421 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2422 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2423
2424 if (U_FAILURE(status)) {
2425 deferredStatus = status;
2426 }
2427 }
2428
2429
2430
2431 void RBBISentMonkey::setText(const UnicodeString &s) {
2432 fText = &s;
2433 prepareAppliedRules(s.length());
2434 }
2435
2436 UVector *RBBISentMonkey::charClasses() {
2437 return fSets;
2438 }
2439
2440 // moveBack() Find the "significant" code point preceding the index i.
2441 // Skips over ($Extend | $Format)* .
2442 //
2443 int RBBISentMonkey::moveBack(int i) {
2444 if (i <= 0) {
2445 return -1;
2446 }
2447 UChar32 c;
2448 int32_t j = i;
2449 do {
2450 j = fText->moveIndex32(j, -1);
2451 c = fText->char32At(j);
2452 }
2453 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2454 return j;
2455
2456 }
2457
2458
2459 int RBBISentMonkey::moveForward(int i) {
2460 if (i>=fText->length()) {
2461 return fText->length();
2462 }
2463 UChar32 c;
2464 int32_t j = i;
2465 do {
2466 j = fText->moveIndex32(j, 1);
2467 c = cAt(j);
2468 }
2469 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2470 return j;
2471 }
2472
2473 UChar32 RBBISentMonkey::cAt(int pos) {
2474 if (pos<0 || pos>=fText->length()) {
2475 return -1;
2476 } else {
2477 return fText->char32At(pos);
2478 }
2479 }
2480
2481 int32_t RBBISentMonkey::next(int32_t prevPos) {
2482 int p0, p1, p2, p3; // Indices of the significant code points around the
2483 // break position being tested. The candidate break
2484 // location is before p2.
2485
2486 int breakPos = -1;
2487
2488 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2489 UChar32 c;
2490
2491 if (U_FAILURE(deferredStatus)) {
2492 return -1;
2493 }
2494
2495 // Prev break at end of string. return DONE.
2496 if (prevPos >= fText->length()) {
2497 return -1;
2498 }
2499 p0 = p1 = p2 = p3 = prevPos;
2500 c3 = fText->char32At(prevPos);
2501 c0 = c1 = c2 = 0;
2502 (void)p0; // Suppress set but not used warning.
2503
2504 // Loop runs once per "significant" character position in the input text.
2505 for (;;) {
2506 // Move all of the positions forward in the input string.
2507 p0 = p1; c0 = c1;
2508 p1 = p2; c1 = c2;
2509 p2 = p3; c2 = c3;
2510
2511 // Advance p3 by X(Extend | Format)* Rule 4
2512 p3 = moveForward(p3);
2513 c3 = cAt(p3);
2514
2515 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2516 setAppliedRule(p2, "SB3 CR x LF");
2517 continue;
2518 }
2519
2520 if (fSepSet->contains(c1)) {
2521 p2 = p1+1; // Separators don't combine with Extend or Format.
2522
2523 setAppliedRule(p2, "SB4 Sep <break>");
2524 break;
2525 }
2526
2527 if (p2 >= fText->length()) {
2528 // Reached end of string. Always a break position.
2529 setAppliedRule(p2, "SB4 Sep <break>");
2530 break;
2531 }
2532
2533 if (p2 == prevPos) {
2534 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2535 setAppliedRule(p2, "SB4 Sep <break>");
2536 continue;
2537 }
2538
2539 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2540 setAppliedRule(p2, "SB6 ATerm x Numeric");
2541 continue;
2542 }
2543
2544 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2545 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2546 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2547 continue;
2548 }
2549
2550 // Note: STerm | ATerm are added to the negated part of the expression by a
2551 // note to the Unicode 5.0 documents.
2552 int p8 = p1;
2553 while (fSpSet->contains(cAt(p8))) {
2554 p8 = moveBack(p8);
2555 }
2556 while (fCloseSet->contains(cAt(p8))) {
2557 p8 = moveBack(p8);
2558 }
2559 if (fATermSet->contains(cAt(p8))) {
2560 p8=p2;
2561 for (;;) {
2562 c = cAt(p8);
2563 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2564 fLowerSet->contains(c) || fSepSet->contains(c) ||
2565 fATermSet->contains(c) || fSTermSet->contains(c)) {
2566
2567 setAppliedRule(p2,
2568 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2569 break;
2570 }
2571 p8 = moveForward(p8);
2572 }
2573 if (fLowerSet->contains(cAt(p8))) {
2574
2575 setAppliedRule(p2,
2576 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2577 continue;
2578 }
2579 }
2580
2581 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2582 p8 = p1;
2583 while (fSpSet->contains(cAt(p8))) {
2584 p8 = moveBack(p8);
2585 }
2586 while (fCloseSet->contains(cAt(p8))) {
2587 p8 = moveBack(p8);
2588 }
2589 c = cAt(p8);
2590 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2591 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2592 continue;
2593 }
2594 }
2595
2596 int p9 = p1;
2597 while (fCloseSet->contains(cAt(p9))) {
2598 p9 = moveBack(p9);
2599 }
2600 c = cAt(p9);
2601 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2602 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2603
2604 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2605 continue;
2606 }
2607 }
2608
2609 int p10 = p1;
2610 while (fSpSet->contains(cAt(p10))) {
2611 p10 = moveBack(p10);
2612 }
2613 while (fCloseSet->contains(cAt(p10))) {
2614 p10 = moveBack(p10);
2615 }
2616 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2617 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2618 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2619 continue;
2620 }
2621 }
2622
2623 int p11 = p1;
2624 if (fSepSet->contains(cAt(p11))) {
2625 p11 = moveBack(p11);
2626 }
2627 while (fSpSet->contains(cAt(p11))) {
2628 p11 = moveBack(p11);
2629 }
2630 while (fCloseSet->contains(cAt(p11))) {
2631 p11 = moveBack(p11);
2632 }
2633 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2634 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2635 break;
2636 }
2637
2638 setAppliedRule(p2, "SB12 Any x Any");
2639 continue;
2640 }
2641
2642 breakPos = p2;
2643 return breakPos;
2644 }
2645
2646 RBBISentMonkey::~RBBISentMonkey() {
2647 delete fSets;
2648 delete fSepSet;
2649 delete fFormatSet;
2650 delete fSpSet;
2651 delete fLowerSet;
2652 delete fUpperSet;
2653 delete fOLetterSet;
2654 delete fNumericSet;
2655 delete fATermSet;
2656 delete fSContinueSet;
2657 delete fSTermSet;
2658 delete fCloseSet;
2659 delete fOtherSet;
2660 delete fExtendSet;
2661 }
2662
2663
2664
2665 //-------------------------------------------------------------------------------------------
2666 //
2667 // RBBILineMonkey
2668 //
2669 //-------------------------------------------------------------------------------------------
2670
2671 class RBBILineMonkey: public RBBIMonkeyKind {
2672 public:
2673 RBBILineMonkey();
2674 virtual ~RBBILineMonkey();
2675 virtual UVector *charClasses();
2676 virtual void setText(const UnicodeString &s);
2677 virtual int32_t next(int32_t i);
2678 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2679 private:
2680 UVector *fSets;
2681
2682 UnicodeSet *fBK;
2683 UnicodeSet *fCR;
2684 UnicodeSet *fLF;
2685 UnicodeSet *fCM;
2686 UnicodeSet *fNL;
2687 UnicodeSet *fSG;
2688 UnicodeSet *fWJ;
2689 UnicodeSet *fZW;
2690 UnicodeSet *fGL;
2691 UnicodeSet *fCB;
2692 UnicodeSet *fSP;
2693 UnicodeSet *fB2;
2694 UnicodeSet *fBA;
2695 UnicodeSet *fBB;
2696 UnicodeSet *fHH;
2697 UnicodeSet *fHY;
2698 UnicodeSet *fH2;
2699 UnicodeSet *fH3;
2700 UnicodeSet *fCL;
2701 UnicodeSet *fCP;
2702 UnicodeSet *fEX;
2703 UnicodeSet *fIN;
2704 UnicodeSet *fJL;
2705 UnicodeSet *fJV;
2706 UnicodeSet *fJT;
2707 UnicodeSet *fNS;
2708 UnicodeSet *fOP;
2709 UnicodeSet *fQU;
2710 UnicodeSet *fIS;
2711 UnicodeSet *fNU;
2712 UnicodeSet *fPO;
2713 UnicodeSet *fPR;
2714 UnicodeSet *fSY;
2715 UnicodeSet *fAI;
2716 UnicodeSet *fAL;
2717 UnicodeSet *fCJ;
2718 UnicodeSet *fHL;
2719 UnicodeSet *fID;
2720 UnicodeSet *fRI;
2721 UnicodeSet *fXX;
2722 UnicodeSet *fEB;
2723 UnicodeSet *fEM;
2724 UnicodeSet *fZWJ;
2725 UnicodeSet *fOP30;
2726 UnicodeSet *fCP30;
2727
2728 BreakIterator *fCharBI;
2729 const UnicodeString *fText;
2730 RegexMatcher *fNumberMatcher;
2731 };
2732
2733 RBBILineMonkey::RBBILineMonkey() :
2734 RBBIMonkeyKind(),
2735 fSets(NULL),
2736
2737 fCharBI(NULL),
2738 fText(NULL),
2739 fNumberMatcher(NULL)
2740
2741 {
2742 if (U_FAILURE(deferredStatus)) {
2743 return;
2744 }
2745
2746 UErrorCode status = U_ZERO_ERROR;
2747
2748 fSets = new UVector(status);
2749
2750 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2751 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2752 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2753 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2754 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2755 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2756 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2757 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2758 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2759 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2760 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2761 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2762 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2763 fHH = new UnicodeSet();
2764 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2765 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2766 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2767 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status); // en adjustments for rdar://problem/51193810
2768 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2769 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2770 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2771 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2772 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2773 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2774 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2775 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status); // en adjustments
2776 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status); // en adjustments
2777 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2778 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2779 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2780 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2781 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2782 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2783 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2784 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2785 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2786 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2787 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2788 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2789 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2790 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2791 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2792 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2793 fOP30 = new UnicodeSet(u"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); // en adjustments
2794 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2795
2796 if (U_FAILURE(status)) {
2797 deferredStatus = status;
2798 return;
2799 }
2800
2801 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2802 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2803 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2804
2805 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2806 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2807
2808 fHH->add(u'\u2010'); // Hyphen, '‐'
2809
2810 // Sets and names.
2811 fSets->addElement(fBK, status); classNames.push_back("fBK");
2812 fSets->addElement(fCR, status); classNames.push_back("fCR");
2813 fSets->addElement(fLF, status); classNames.push_back("fLF");
2814 fSets->addElement(fCM, status); classNames.push_back("fCM");
2815 fSets->addElement(fNL, status); classNames.push_back("fNL");
2816 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2817 fSets->addElement(fZW, status); classNames.push_back("fZW");
2818 fSets->addElement(fGL, status); classNames.push_back("fGL");
2819 fSets->addElement(fCB, status); classNames.push_back("fCB");
2820 fSets->addElement(fSP, status); classNames.push_back("fSP");
2821 fSets->addElement(fB2, status); classNames.push_back("fB2");
2822 fSets->addElement(fBA, status); classNames.push_back("fBA");
2823 fSets->addElement(fBB, status); classNames.push_back("fBB");
2824 fSets->addElement(fHY, status); classNames.push_back("fHY");
2825 fSets->addElement(fH2, status); classNames.push_back("fH2");
2826 fSets->addElement(fH3, status); classNames.push_back("fH3");
2827 fSets->addElement(fCL, status); classNames.push_back("fCL");
2828 fSets->addElement(fCP, status); classNames.push_back("fCP");
2829 fSets->addElement(fEX, status); classNames.push_back("fEX");
2830 fSets->addElement(fIN, status); classNames.push_back("fIN");
2831 fSets->addElement(fJL, status); classNames.push_back("fJL");
2832 fSets->addElement(fJT, status); classNames.push_back("fJT");
2833 fSets->addElement(fJV, status); classNames.push_back("fJV");
2834 fSets->addElement(fNS, status); classNames.push_back("fNS");
2835 fSets->addElement(fOP, status); classNames.push_back("fOP");
2836 fSets->addElement(fQU, status); classNames.push_back("fQU");
2837 fSets->addElement(fIS, status); classNames.push_back("fIS");
2838 fSets->addElement(fNU, status); classNames.push_back("fNU");
2839 fSets->addElement(fPO, status); classNames.push_back("fPO");
2840 fSets->addElement(fPR, status); classNames.push_back("fPR");
2841 fSets->addElement(fSY, status); classNames.push_back("fSY");
2842 fSets->addElement(fAI, status); classNames.push_back("fAI");
2843 fSets->addElement(fAL, status); classNames.push_back("fAL");
2844 fSets->addElement(fHL, status); classNames.push_back("fHL");
2845 fSets->addElement(fID, status); classNames.push_back("fID");
2846 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2847 fSets->addElement(fRI, status); classNames.push_back("fRI");
2848 fSets->addElement(fSG, status); classNames.push_back("fSG");
2849 fSets->addElement(fEB, status); classNames.push_back("fEB");
2850 fSets->addElement(fEM, status); classNames.push_back("fEM");
2851 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2852 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2853 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2854 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2855
2856 const char *rules =
2857 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2858 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2859 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2860 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2861 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2862 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2863 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2864
2865 fNumberMatcher = new RegexMatcher(
2866 UnicodeString(rules, -1, US_INV), 0, status);
2867
2868 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2869
2870 if (U_FAILURE(status)) {
2871 deferredStatus = status;
2872 }
2873
2874 }
2875
2876
2877 void RBBILineMonkey::setText(const UnicodeString &s) {
2878 fText = &s;
2879 fCharBI->setText(s);
2880 prepareAppliedRules(s.length());
2881 fNumberMatcher->reset(s);
2882 }
2883
2884 //
2885 // rule9Adjust
2886 // Line Break TR rules 9 and 10 implementation.
2887 // This deals with combining marks and other sequences that
2888 // that must be treated as if they were something other than what they actually are.
2889 //
2890 // This is factored out into a separate function because it must be applied twice for
2891 // each potential break, once to the chars before the position being checked, then
2892 // again to the text following the possible break.
2893 //
2894 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2895 if (pos == -1) {
2896 // Invalid initial position. Happens during the warmup iteration of the
2897 // main loop in next().
2898 return;
2899 }
2900
2901 int32_t nPos = *nextPos;
2902
2903 // LB 9 Keep combining sequences together.
2904 // advance over any CM class chars. Note that Line Break CM is different
2905 // from the normal Grapheme Extend property.
2906 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2907 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2908 for (;;) {
2909 *nextChar = fText->char32At(nPos);
2910 if (!fCM->contains(*nextChar)) {
2911 break;
2912 }
2913 nPos = fText->moveIndex32(nPos, 1);
2914 }
2915 }
2916
2917
2918 // LB 9 Treat X CM* as if it were x.
2919 // No explicit action required.
2920
2921 // LB 10 Treat any remaining combining mark as AL
2922 if (fCM->contains(*posChar)) {
2923 *posChar = u'A';
2924 }
2925
2926 // Push the updated nextPos and nextChar back to our caller.
2927 // This only makes a difference if posChar got bigger by consuming a
2928 // combining sequence.
2929 *nextPos = nPos;
2930 *nextChar = fText->char32At(nPos);
2931 }
2932
2933
2934
2935 int32_t RBBILineMonkey::next(int32_t startPos) {
2936 UErrorCode status = U_ZERO_ERROR;
2937 int32_t pos; // Index of the char following a potential break position
2938 UChar32 thisChar; // Character at above position "pos"
2939
2940 int32_t prevPos; // Index of the char preceding a potential break position
2941 UChar32 prevChar; // Character at above position. Note that prevChar
2942 // and thisChar may not be adjacent because combining
2943 // characters between them will be ignored.
2944
2945 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2946 UChar32 prevCharX2;
2947
2948 int32_t nextPos; // Index of the next character following pos.
2949 // Usually skips over combining marks.
2950 int32_t nextCPPos; // Index of the code point following "pos."
2951 // May point to a combining mark.
2952 int32_t tPos; // temp value.
2953 UChar32 c;
2954
2955 if (U_FAILURE(deferredStatus)) {
2956 return -1;
2957 }
2958
2959 if (startPos >= fText->length()) {
2960 return -1;
2961 }
2962
2963
2964 // Initial values for loop. Loop will run the first time without finding breaks,
2965 // while the invalid values shift out and the "this" and
2966 // "prev" positions are filled in with good values.
2967 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2968 thisChar = prevChar = prevCharX2 = 0;
2969 nextPos = nextCPPos = startPos;
2970
2971
2972 // Loop runs once per position in the test text, until a break position
2973 // is found.
2974 for (;;) {
2975 prevPosX2 = prevPos;
2976 prevCharX2 = prevChar;
2977
2978 prevPos = pos;
2979 prevChar = thisChar;
2980
2981 pos = nextPos;
2982 thisChar = fText->char32At(pos);
2983
2984 nextCPPos = fText->moveIndex32(pos, 1);
2985 nextPos = nextCPPos;
2986
2987
2988 if (pos >= fText->length()) {
2989 setAppliedRule(pos, "LB2 - Break at end of text.");
2990 break;
2991 }
2992
2993
2994 // We do this one out-of-order because the adjustment does not change anything
2995 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2996 // be applied.
2997 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2998 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2999 c = fText->char32At(nextPos);
3000 rule9Adjust(pos, &thisChar, &nextPos, &c);
3001
3002 // If the loop is still warming up - if we haven't shifted the initial
3003 // -1 positions out of prevPos yet - loop back to advance the
3004 // position in the input without any further looking for breaks.
3005 if (prevPos == -1) {
3006 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
3007 continue;
3008 }
3009
3010
3011 if (fBK->contains(prevChar)) {
3012 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
3013 break;
3014 }
3015
3016
3017 if (prevChar == 0x0d && thisChar == 0x0a) {
3018 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3019 continue;
3020 }
3021 if (prevChar == 0x0d ||
3022 prevChar == 0x0a ||
3023 prevChar == 0x85) {
3024 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3025 break;
3026 }
3027
3028
3029 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3030 fBK->contains(thisChar)) {
3031 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
3032 continue;
3033 }
3034
3035
3036 if (fSP->contains(thisChar)) {
3037 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
3038 continue;
3039 }
3040
3041 // !!! ??? Is this the right text for the applied rule?
3042 if (fZW->contains(thisChar)) {
3043 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
3044 continue;
3045 }
3046
3047
3048 // ZW SP* ÷
3049 // Scan backwards from prevChar for SP* ZW
3050 tPos = prevPos;
3051 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3052 tPos = fText->moveIndex32(tPos, -1);
3053 }
3054 if (fZW->contains(fText->char32At(tPos))) {
3055 setAppliedRule(pos, "LB 8 Break after zero width space");
3056 break;
3057 }
3058
3059
3060 // Move this test up, before LB8a, because numbers can match a longer sequence that would
3061 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
3062 if (fNumberMatcher->lookingAt(prevPos, status)) {
3063 if (U_FAILURE(status)) {
3064 setAppliedRule(pos, "LB 25 Numbers");
3065 break;
3066 }
3067 // Matched a number. But could have been just a single digit, which would
3068 // not represent a "no break here" between prevChar and thisChar
3069 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3070 if (numEndIdx > pos) {
3071 // Number match includes at least our two chars being checked
3072 if (numEndIdx > nextPos) {
3073 // Number match includes additional chars. Update pos and nextPos
3074 // so that next loop iteration will continue at the end of the number,
3075 // checking for breaks between last char in number & whatever follows.
3076 pos = nextPos = numEndIdx;
3077 do {
3078 pos = fText->moveIndex32(pos, -1);
3079 thisChar = fText->char32At(pos);
3080 } while (fCM->contains(thisChar));
3081 }
3082 setAppliedRule(pos, "LB 25 Numbers");
3083 continue;
3084 }
3085 }
3086
3087
3088 // The monkey test's way of ignoring combining characters doesn't work
3089 // for this rule. ZJ is also a CM. Need to get the actual character
3090 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3091 {
3092 int32_t prevIdx = fText->moveIndex32(pos, -1);
3093 UChar32 prevC = fText->char32At(prevIdx);
3094 if (fZWJ->contains(prevC)) {
3095 setAppliedRule(pos, "LB 8a ZWJ x");
3096 continue;
3097 }
3098 }
3099
3100
3101 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3102 //
3103
3104
3105 // x WJ
3106 // WJ x
3107 //
3108 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3109 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
3110 continue;
3111 }
3112
3113
3114 if (fGL->contains(prevChar)) {
3115 setAppliedRule(pos, "LB 12 GL x");
3116 continue;
3117 }
3118
3119
3120 if (!(fSP->contains(prevChar) ||
3121 fBA->contains(prevChar) ||
3122 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3123 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3124 continue;
3125 }
3126
3127
3128 if (fCL->contains(thisChar) ||
3129 fCP->contains(thisChar) ||
3130 fEX->contains(thisChar) ||
3131 fSY->contains(thisChar)) {
3132 setAppliedRule(pos, "LB 13 Don't break before closings.");
3133 continue;
3134 }
3135
3136
3137 // Scan backwards, checking for this sequence.
3138 // The OP char could include combining marks, so we actually check for
3139 // OP CM* SP*
3140 // Another Twist: The Rule 9 fixes may have changed a SP CM
3141 // sequence into a ID char, so before scanning back through spaces,
3142 // verify that prevChar is indeed a space. The prevChar variable
3143 // may differ from fText[prevPos]
3144 tPos = prevPos;
3145 if (fSP->contains(prevChar)) {
3146 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3147 tPos=fText->moveIndex32(tPos, -1);
3148 }
3149 }
3150 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3151 tPos=fText->moveIndex32(tPos, -1);
3152 }
3153 if (fOP->contains(fText->char32At(tPos))) {
3154 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3155 continue;
3156 }
3157
3158
3159 if (nextPos < fText->length()) {
3160 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3161 // from a legit ffff character. So test length separately.
3162 UChar32 nextChar = fText->char32At(nextPos);
3163 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3164 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3165 break;
3166 }
3167 }
3168
3169
3170 if (fIS->contains(thisChar)) {
3171 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3172 continue;
3173 }
3174
3175
3176 if (fOP->contains(thisChar)) {
3177 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3178 int tPos = prevPos;
3179 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3180 tPos = fText->moveIndex32(tPos, -1);
3181 }
3182 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3183 tPos = fText->moveIndex32(tPos, -1);
3184 }
3185 if (fQU->contains(fText->char32At(tPos))) {
3186 setAppliedRule(pos, "LB 15 QU SP* x OP");
3187 continue;
3188 }
3189 }
3190
3191
3192 // Scan backwards for SP* CM* (CL | CP)
3193 if (fNS->contains(thisChar)) {
3194 int tPos = prevPos;
3195 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3196 tPos = fText->moveIndex32(tPos, -1);
3197 }
3198 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3199 tPos = fText->moveIndex32(tPos, -1);
3200 }
3201 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3202 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
3203 continue;
3204 }
3205 }
3206
3207
3208 if (fB2->contains(thisChar)) {
3209 // Scan backwards, checking for the B2 CM* SP* sequence.
3210 tPos = prevPos;
3211 if (fSP->contains(prevChar)) {
3212 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3213 tPos=fText->moveIndex32(tPos, -1);
3214 }
3215 }
3216 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3217 tPos=fText->moveIndex32(tPos, -1);
3218 }
3219 if (fB2->contains(fText->char32At(tPos))) {
3220 setAppliedRule(pos, "LB 17 B2 SP* x B2");
3221 continue;
3222 }
3223 }
3224
3225
3226 if (fSP->contains(prevChar)) {
3227 setAppliedRule(pos, "LB 18 break after space");
3228 break;
3229 }
3230
3231 // x QU
3232 // QU x
3233 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3234 setAppliedRule(pos, "LB 19");
3235 continue;
3236 }
3237
3238 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3239 setAppliedRule(pos, "LB 20 Break around a CB");
3240 break;
3241 }
3242
3243 // Don't break between Hyphens and letters if a break precedes the hyphen.
3244 // Formerly this was a Finnish tailoring.
3245 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3246 // ^($HY | $HH) $AL;
3247 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3248 prevPosX2 == -1) {
3249 setAppliedRule(pos, "LB 20.09");
3250 continue;
3251 }
3252
3253 if (fBA->contains(thisChar) ||
3254 fHY->contains(thisChar) ||
3255 fNS->contains(thisChar) ||
3256 fBB->contains(prevChar) ) {
3257 setAppliedRule(pos, "LB 21");
3258 continue;
3259 }
3260
3261 if (fHL->contains(prevCharX2) &&
3262 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3263 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
3264 continue;
3265 }
3266
3267 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3268 setAppliedRule(pos, "LB 21b SY x HL");
3269 continue;
3270 }
3271
3272 if (fIN->contains(thisChar)) {
3273 setAppliedRule(pos, "LB 22");
3274 continue;
3275 }
3276
3277
3278 // (AL | HL) x NU
3279 // NU x (AL | HL)
3280 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3281 setAppliedRule(pos, "LB 23");
3282 continue;
3283 }
3284 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3285 setAppliedRule(pos, "LB 23");
3286 continue;
3287 }
3288
3289 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3290 // PR x (ID | EB | EM)
3291 // (ID | EB | EM) x PO
3292 if (fPR->contains(prevChar) &&
3293 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3294 setAppliedRule(pos, "LB 23a");
3295 continue;
3296 }
3297 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3298 fPO->contains(thisChar)) {
3299 setAppliedRule(pos, "LB 23a");
3300 continue;
3301 }
3302
3303 // Do not break between prefix and letters or ideographs.
3304 // (PR | PO) x (AL | HL)
3305 // (AL | HL) x (PR | PO)
3306 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3307 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3308 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3309 continue;
3310 }
3311 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3312 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3313 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3314 continue;
3315 }
3316
3317 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3318
3319 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3320 fJV->contains(thisChar) ||
3321 fH2->contains(thisChar) ||
3322 fH3->contains(thisChar))) {
3323 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3324 continue;
3325 }
3326
3327 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3328 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3329 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3330 continue;
3331 }
3332
3333 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3334 fJT->contains(thisChar)) {
3335 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3336 continue;
3337 }
3338
3339 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3340 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3341 fIN->contains(thisChar)) {
3342 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3343 continue;
3344 }
3345 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3346 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3347 fPO->contains(thisChar)) {
3348 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3349 continue;
3350 }
3351 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3352 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3353 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3354 continue;
3355 }
3356
3357
3358
3359 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3360 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
3361 continue;
3362 }
3363
3364 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3365 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3366 continue;
3367 }
3368
3369 // (AL | NU) x OP
3370 // CP x (AL | NU)
3371 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3372 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3373 continue;
3374 }
3375 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3376 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3377 continue;
3378 }
3379
3380 // RI x RI
3381 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3382 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3383 break;
3384 }
3385 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3386 // Two Regional Indicators have been paired.
3387 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3388 // following RI. This is a hack.
3389 thisChar = -1;
3390 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3391 continue;
3392 }
3393
3394 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3395 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
3396 continue;
3397 }
3398
3399 setAppliedRule(pos, "LB 31 Break everywhere else");
3400 break;
3401 }
3402
3403 return pos;
3404 }
3405
3406
3407 UVector *RBBILineMonkey::charClasses() {
3408 return fSets;
3409 }
3410
3411
3412 RBBILineMonkey::~RBBILineMonkey() {
3413 delete fSets;
3414
3415 delete fBK;
3416 delete fCR;
3417 delete fLF;
3418 delete fCM;
3419 delete fNL;
3420 delete fWJ;
3421 delete fZW;
3422 delete fGL;
3423 delete fCB;
3424 delete fSP;
3425 delete fB2;
3426 delete fBA;
3427 delete fBB;
3428 delete fHH;
3429 delete fHY;
3430 delete fH2;
3431 delete fH3;
3432 delete fCL;
3433 delete fCP;
3434 delete fEX;
3435 delete fIN;
3436 delete fJL;
3437 delete fJV;
3438 delete fJT;
3439 delete fNS;
3440 delete fOP;
3441 delete fQU;
3442 delete fIS;
3443 delete fNU;
3444 delete fPO;
3445 delete fPR;
3446 delete fSY;
3447 delete fAI;
3448 delete fAL;
3449 delete fCJ;
3450 delete fHL;
3451 delete fID;
3452 delete fRI;
3453 delete fSG;
3454 delete fXX;
3455 delete fEB;
3456 delete fEM;
3457 delete fZWJ;
3458 delete fOP30;
3459 delete fCP30;
3460
3461 delete fCharBI;
3462 delete fNumberMatcher;
3463 }
3464
3465
3466 //-------------------------------------------------------------------------------------------
3467 //
3468 // TestMonkey
3469 //
3470 // params
3471 // seed=nnnnn Random number starting seed.
3472 // Setting the seed allows errors to be reproduced.
3473 // loop=nnn Looping count. Controls running time.
3474 // -1: run forever.
3475 // 0 or greater: run length.
3476 //
3477 // type = char | word | line | sent | title
3478 //
3479 // Example:
3480 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3481 //
3482 //-------------------------------------------------------------------------------------------
3483
3484 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3485 int32_t val = defaultVal;
3486 name.append(" *= *(-?\\d+)");
3487 UErrorCode status = U_ZERO_ERROR;
3488 RegexMatcher m(name, params, 0, status);
3489 if (m.find()) {
3490 // The param exists. Convert the string to an int.
3491 char valString[100];
3492 int32_t paramLength = m.end(1, status) - m.start(1, status);
3493 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3494 paramLength = (int32_t)(sizeof(valString)-2);
3495 }
3496 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3497 val = strtol(valString, NULL, 10);
3498
3499 // Delete this parameter from the params string.
3500 m.reset();
3501 params = m.replaceFirst("", status);
3502 }
3503 U_ASSERT(U_SUCCESS(status));
3504 return val;
3505 }
3506 #endif
3507
3508 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3509 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3510 BreakIterator *bi,
3511 int expected[],
3512 int expectedcount)
3513 {
3514 int count = 0;
3515 int i = 0;
3516 int forward[50];
3517 bi->setText(ustr);
3518 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3519 forward[count] = i;
3520 if (count < expectedcount && expected[count] != i) {
3521 test->errln("%s:%d break forward test failed: expected %d but got %d",
3522 __FILE__, __LINE__, expected[count], i);
3523 break;
3524 }
3525 count ++;
3526 }
3527 if (count != expectedcount) {
3528 printStringBreaks(ustr, expected, expectedcount);
3529 test->errln("%s:%d break forward test failed: missed %d match",
3530 __FILE__, __LINE__, expectedcount - count);
3531 return;
3532 }
3533 // testing boundaries
3534 for (i = 1; i < expectedcount; i ++) {
3535 int j = expected[i - 1];
3536 if (!bi->isBoundary(j)) {
3537 printStringBreaks(ustr, expected, expectedcount);
3538 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3539 __FILE__, __LINE__, j);
3540 return;
3541 }
3542 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3543 if (bi->isBoundary(j)) {
3544 printStringBreaks(ustr, expected, expectedcount);
3545 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3546 __FILE__, __LINE__, j);
3547 return;
3548 }
3549 }
3550 }
3551
3552 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3553 count --;
3554 if (forward[count] != i) {
3555 printStringBreaks(ustr, expected, expectedcount);
3556 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3557 __FILE__, __LINE__, forward[count], i);
3558 break;
3559 }
3560 }
3561 if (count != 0) {
3562 printStringBreaks(ustr, expected, expectedcount);
3563 test->errln("break test previous() failed: missed a match");
3564 return;
3565 }
3566
3567 // testing preceding
3568 for (i = 0; i < expectedcount - 1; i ++) {
3569 // int j = expected[i] + 1;
3570 int j = ustr.moveIndex32(expected[i], 1);
3571 for (; j <= expected[i + 1]; j ++) {
3572 int32_t expectedPreceding = expected[i];
3573 int32_t actualPreceding = bi->preceding(j);
3574 if (actualPreceding != expectedPreceding) {
3575 printStringBreaks(ustr, expected, expectedcount);
3576 test->errln("%s:%d preceding(%d): expected %d, got %d",
3577 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3578 return;
3579 }
3580 }
3581 }
3582 }
3583 #endif
3584
3585 void RBBITest::TestWordBreaks(void)
3586 {
3587 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3588
3589 Locale locale("en");
3590 UErrorCode status = U_ZERO_ERROR;
3591 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3592 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3593 // Replaced any C+J characters in a row with a random sequence of characters
3594 // of the same length to make our C+J segmentation not get in the way.
3595 static const char *strlist[] =
3596 {
3597 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3598 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3599 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3600 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3601 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3602 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3603 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3604 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3605 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3606 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3607 "\\u2027\\U000e0067\\u0a47\\u00b7",
3608 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3609 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3610 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3611 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3612 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3613 "\\u0027\\u11af\\U000e0057\\u0602",
3614 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3615 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3616 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3617 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3618 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3619 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3620 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3621 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3622 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3623 "\\u18f4\\U000e0049\\u20e7\\u2027",
3624 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3625 "\\ua183\\u102d\\u0bec\\u003a",
3626 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3627 "\\u003a\\u0e57\\u0fad\\u002e",
3628 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3629 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3630 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3631 "\\u003a\\u0664\\u00b7\\u1fba",
3632 "\\u003b\\u0027\\u00b7\\u47a3",
3633 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3634 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3635 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3636 };
3637 int loop;
3638 if (U_FAILURE(status)) {
3639 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3640 return;
3641 }
3642 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3643 // printf("looping %d\n", loop);
3644 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3645 // RBBICharMonkey monkey;
3646 RBBIWordMonkey monkey;
3647
3648 int expected[50];
3649 int expectedcount = 0;
3650
3651 monkey.setText(ustr);
3652 int i;
3653 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3654 expected[expectedcount ++] = i;
3655 }
3656
3657 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3658 }
3659 delete bi;
3660 #endif
3661 }
3662
3663 void RBBITest::TestWordBoundary(void)
3664 {
3665 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3666 Locale locale("en");
3667 UErrorCode status = U_ZERO_ERROR;
3668 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3669 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3670 if (U_FAILURE(status)) {
3671 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3672 __FILE__, __LINE__, u_errorName(status));
3673 return;
3674 }
3675 UChar str[50];
3676 static const char *strlist[] =
3677 {
3678 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3679 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3680 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3681 "\\u2027\\U000e0067\\u0a47\\u00b7",
3682 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3683 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3684 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3685 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3686 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3687 "\\u0027\\u11af\\U000e0057\\u0602",
3688 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3689 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3690 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3691 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3692 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3693 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3694 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3695 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3696 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3697 "\\u58f4\\U000e0049\\u20e7\\u2027",
3698 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3699 "\\ua183\\u102d\\u0bec\\u003a",
3700 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3701 "\\u003a\\u0e57\\u0fad\\u002e",
3702 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3703 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3704 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3705 "\\u003a\\u0664\\u00b7\\u1fba",
3706 "\\u003b\\u0027\\u00b7\\u47a3",
3707 };
3708 int loop;
3709 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3710 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3711 UnicodeString ustr(str);
3712 int forward[50];
3713 int count = 0;
3714
3715 bi->setText(ustr);
3716 int prev = -1;
3717 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3718 ++count;
3719 if (count >= UPRV_LENGTHOF(forward)) {
3720 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3721 __FILE__, __LINE__, loop, count, boundary);
3722 return;
3723 }
3724 forward[count] = boundary;
3725 if (boundary <= prev) {
3726 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3727 __FILE__, __LINE__, loop, prev, boundary);
3728 break;
3729 }
3730 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3731 if (bi->isBoundary(nonBoundary)) {
3732 printStringBreaks(ustr, forward, count);
3733 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3734 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3735 return;
3736 }
3737 }
3738 if (!bi->isBoundary(boundary)) {
3739 printStringBreaks(ustr, forward, count);
3740 errln("%s:%d happy boundary test failed: expected %d a boundary",
3741 __FILE__, __LINE__, boundary);
3742 return;
3743 }
3744 prev = boundary;
3745 }
3746 }
3747 }
3748
3749 void RBBITest::TestLineBreaks(void)
3750 {
3751 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3752 Locale locale("en");
3753 UErrorCode status = U_ZERO_ERROR;
3754 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3755 const int32_t STRSIZE = 50;
3756 UChar str[STRSIZE];
3757 static const char *strlist[] =
3758 {
3759 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3760 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3761 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3762 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3763 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3764 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3765 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3766 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3767 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3768 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3769 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3770 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3771 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3772 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3773 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3774 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3775 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3776 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3777 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3778 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3779 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3780 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3781 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3782 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3783 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3784 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3785 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3786 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3787 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3788 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3789 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3790 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3791 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3792 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3793 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3794 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3795 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3796 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3797 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3798 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3799 };
3800 int loop;
3801 TEST_ASSERT_SUCCESS(status);
3802 if (U_FAILURE(status)) {
3803 return;
3804 }
3805 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3806 // printf("looping %d\n", loop);
3807 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3808 if (t >= STRSIZE) {
3809 TEST_ASSERT(FALSE);
3810 continue;
3811 }
3812
3813
3814 UnicodeString ustr(str);
3815 RBBILineMonkey monkey;
3816 if (U_FAILURE(monkey.deferredStatus)) {
3817 continue;
3818 }
3819
3820 const int EXPECTEDSIZE = 50;
3821 int expected[EXPECTEDSIZE];
3822 int expectedcount = 0;
3823
3824 monkey.setText(ustr);
3825
3826 int i;
3827 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3828 if (expectedcount >= EXPECTEDSIZE) {
3829 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3830 return;
3831 }
3832 expected[expectedcount ++] = i;
3833 }
3834
3835 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3836 }
3837 delete bi;
3838 #endif
3839 }
3840
3841 void RBBITest::TestSentBreaks(void)
3842 {
3843 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3844 Locale locale("en");
3845 UErrorCode status = U_ZERO_ERROR;
3846 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3847 UChar str[200];
3848 static const char *strlist[] =
3849 {
3850 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3851 "This\n",
3852 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3853 "\"Sentence ending with a quote.\" Bye.",
3854 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3855 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3856 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3857 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3858 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3859 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3860 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3861 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3862 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3863 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3864 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3865 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3866 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3867 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3868 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3869 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3870 };
3871 int loop;
3872 if (U_FAILURE(status)) {
3873 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3874 return;
3875 }
3876 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3877 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3878 UnicodeString ustr(str);
3879
3880 RBBISentMonkey monkey;
3881 if (U_FAILURE(monkey.deferredStatus)) {
3882 continue;
3883 }
3884
3885 const int EXPECTEDSIZE = 50;
3886 int expected[EXPECTEDSIZE];
3887 int expectedcount = 0;
3888
3889 monkey.setText(ustr);
3890
3891 int i;
3892 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3893 if (expectedcount >= EXPECTEDSIZE) {
3894 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3895 return;
3896 }
3897 expected[expectedcount ++] = i;
3898 }
3899
3900 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3901 }
3902 delete bi;
3903 #endif
3904 }
3905
3906 void RBBITest::TestMonkey() {
3907 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3908
3909 UErrorCode status = U_ZERO_ERROR;
3910 int32_t loopCount = 500;
3911 int32_t seed = 1;
3912 UnicodeString breakType = "all";
3913 Locale locale("en");
3914 UBool useUText = FALSE;
3915
3916 if (quick == FALSE) {
3917 loopCount = 10000;
3918 }
3919
3920 if (fTestParams) {
3921 UnicodeString p(fTestParams);
3922 loopCount = getIntParam("loop", p, loopCount);
3923 seed = getIntParam("seed", p, seed);
3924
3925 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3926 if (m.find()) {
3927 breakType = m.group(1, status);
3928 m.reset();
3929 p = m.replaceFirst("", status);
3930 }
3931
3932 RegexMatcher u(" *utext", p, 0, status);
3933 if (u.find()) {
3934 useUText = TRUE;
3935 u.reset();
3936 p = u.replaceFirst("", status);
3937 }
3938
3939
3940 // m.reset(p);
3941 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3942 // Each option is stripped out of the option string as it is processed.
3943 // All options have been checked. The option string should have been completely emptied..
3944 char buf[100];
3945 p.extract(buf, sizeof(buf), NULL, status);
3946 buf[sizeof(buf)-1] = 0;
3947 errln("Unrecognized or extra parameter: %s\n", buf);
3948 return;
3949 }
3950
3951 }
3952
3953 if (breakType == "char" || breakType == "all") {
3954 RBBICharMonkey m;
3955 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3956 if (U_SUCCESS(status)) {
3957 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3958 if (breakType == "all" && useUText==FALSE) {
3959 // Also run a quick test with UText when "all" is specified
3960 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3961 }
3962 }
3963 else {
3964 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3965 }
3966 delete bi;
3967 }
3968
3969 if (breakType == "word" || breakType == "all") {
3970 logln("Word Break Monkey Test");
3971 RBBIWordMonkey m;
3972 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3973 if (U_SUCCESS(status)) {
3974 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3975 }
3976 else {
3977 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3978 }
3979 delete bi;
3980 }
3981
3982 if (breakType == "line" || breakType == "all") {
3983 logln("Line Break Monkey Test");
3984 RBBILineMonkey m;
3985 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3986 if (loopCount >= 10) {
3987 loopCount = loopCount / 5; // Line break runs slower than the others.
3988 }
3989 if (U_SUCCESS(status)) {
3990 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3991 }
3992 else {
3993 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3994 }
3995 delete bi;
3996 }
3997
3998 if (breakType == "sent" || breakType == "all" ) {
3999 logln("Sentence Break Monkey Test");
4000 RBBISentMonkey m;
4001 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4002 if (loopCount >= 10) {
4003 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4004 }
4005 if (U_SUCCESS(status)) {
4006 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
4007 }
4008 else {
4009 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4010 }
4011 delete bi;
4012 }
4013
4014 #endif
4015 }
4016
4017 //
4018 // Run a RBBI monkey test. Common routine, for all break iterator types.
4019 // Parameters:
4020 // bi - the break iterator to use
4021 // mk - MonkeyKind, abstraction for obtaining expected results
4022 // name - Name of test (char, word, etc.) for use in error messages
4023 // seed - Seed for starting random number generator (parameter from user)
4024 // numIterations
4025 //
4026 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4027 int32_t numIterations, UBool useUText) {
4028
4029 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4030
4031 const int32_t TESTSTRINGLEN = 500;
4032 UnicodeString testText;
4033 int32_t numCharClasses;
4034 UVector *chClasses;
4035 int expectedCount = 0;
4036 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4037 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4038 char reverseBreaks[TESTSTRINGLEN*2+1];
4039 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4040 char followingBreaks[TESTSTRINGLEN*2+1];
4041 char precedingBreaks[TESTSTRINGLEN*2+1];
4042 int i;
4043 int loopCount = 0;
4044
4045
4046 m_seed = seed;
4047
4048 numCharClasses = mk.charClasses()->size();
4049 chClasses = mk.charClasses();
4050
4051 // Check for errors that occured during the construction of the MonkeyKind object.
4052 // Can't report them where they occured because errln() is a method coming from intlTest,
4053 // and is not visible outside of RBBITest :-(
4054 if (U_FAILURE(mk.deferredStatus)) {
4055 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4056 return;
4057 }
4058
4059 // Verify that the character classes all have at least one member.
4060 for (i=0; i<numCharClasses; i++) {
4061 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4062 if (s == NULL || s->size() == 0) {
4063 errln("Character Class #%d is null or of zero size.", i);
4064 return;
4065 }
4066 }
4067
4068 // For minimizing width of class name output.
4069 int classNameSize = mk.maxClassNameSize();
4070
4071 while (loopCount < numIterations || numIterations == -1) {
4072 if (numIterations == -1 && loopCount % 10 == 0) {
4073 // If test is running in an infinite loop, display a periodic tic so
4074 // we can tell that it is making progress.
4075 fprintf(stderr, ".");
4076 }
4077 // Save current random number seed, so that we can recreate the random numbers
4078 // for this loop iteration in event of an error.
4079 seed = m_seed;
4080
4081 // Populate a test string with data.
4082 testText.truncate(0);
4083 for (i=0; i<TESTSTRINGLEN; i++) {
4084 int32_t aClassNum = m_rand() % numCharClasses;
4085 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4086 int32_t charIdx = m_rand() % classSet->size();
4087 UChar32 c = classSet->charAt(charIdx);
4088 if (c < 0) { // TODO: deal with sets containing strings.
4089 errln("%s:%d c < 0", __FILE__, __LINE__);
4090 break;
4091 }
4092 // Do not assemble a supplementary character from randomly generated separate surrogates.
4093 // (It could be a dictionary character)
4094 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4095 continue;
4096 }
4097
4098 testText.append(c);
4099 }
4100
4101 // Calculate the expected results for this test string and reset applied rules.
4102 mk.setText(testText);
4103
4104 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4105 expectedBreaks[0] = 1;
4106 int32_t breakPos = 0;
4107 expectedCount = 0;
4108 for (;;) {
4109 breakPos = mk.next(breakPos);
4110 if (breakPos == -1) {
4111 break;
4112 }
4113 if (breakPos > testText.length()) {
4114 errln("breakPos > testText.length()");
4115 }
4116 expectedBreaks[breakPos] = 1;
4117 U_ASSERT(expectedCount<testText.length());
4118 }
4119
4120 // Find the break positions using forward iteration
4121 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4122 if (useUText) {
4123 UErrorCode status = U_ZERO_ERROR;
4124 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4125 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4126 bi->setText(testUText, status);
4127 TEST_ASSERT_SUCCESS(status);
4128 utext_close(testUText); // The break iterator does a shallow clone of the UText
4129 // This UText can be closed immediately, so long as the
4130 // testText string continues to exist.
4131 } else {
4132 bi->setText(testText);
4133 }
4134
4135 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4136 if (i < 0 || i > testText.length()) {
4137 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4138 break;
4139 }
4140 forwardBreaks[i] = 1;
4141 }
4142
4143 // Find the break positions using reverse iteration
4144 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4145 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4146 if (i < 0 || i > testText.length()) {
4147 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4148 break;
4149 }
4150 reverseBreaks[i] = 1;
4151 }
4152
4153 // Find the break positions using isBoundary() tests.
4154 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4155 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4156 for (i=0; i<=testText.length(); i++) {
4157 isBoundaryBreaks[i] = bi->isBoundary(i);
4158 }
4159
4160
4161 // Find the break positions using the following() function.
4162 // printf(".");
4163 memset(followingBreaks, 0, sizeof(followingBreaks));
4164 int32_t lastBreakPos = 0;
4165 followingBreaks[0] = 1;
4166 for (i=0; i<testText.length(); i++) {
4167 breakPos = bi->following(i);
4168 if (breakPos <= i ||
4169 breakPos < lastBreakPos ||
4170 breakPos > testText.length() ||
4171 (breakPos > lastBreakPos && lastBreakPos > i)) {
4172 errln("%s break monkey test: "
4173 "Out of range value returned by BreakIterator::following().\n"
4174 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4175 name, seed, i, breakPos, lastBreakPos);
4176 break;
4177 }
4178 followingBreaks[breakPos] = 1;
4179 lastBreakPos = breakPos;
4180 }
4181
4182 // Find the break positions using the preceding() function.
4183 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4184 lastBreakPos = testText.length();
4185 precedingBreaks[testText.length()] = 1;
4186 for (i=testText.length(); i>0; i--) {
4187 breakPos = bi->preceding(i);
4188 if (breakPos >= i ||
4189 breakPos > lastBreakPos ||
4190 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4191 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4192 errln("%s break monkey test: "
4193 "Out of range value returned by BreakIterator::preceding().\n"
4194 "index=%d; prev returned %d; lastBreak=%d" ,
4195 name, i, breakPos, lastBreakPos);
4196 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4197 precedingBreaks[i] = 2; // Forces an error.
4198 }
4199 } else {
4200 if (breakPos >= 0) {
4201 precedingBreaks[breakPos] = 1;
4202 }
4203 lastBreakPos = breakPos;
4204 }
4205 }
4206
4207 // Compare the expected and actual results.
4208 for (i=0; i<=testText.length(); i++) {
4209 const char *errorType = NULL;
4210 const char* currentBreakData = NULL;
4211 if (forwardBreaks[i] != expectedBreaks[i]) {
4212 errorType = "next()";
4213 currentBreakData = forwardBreaks;
4214 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4215 errorType = "previous()";
4216 currentBreakData = reverseBreaks;
4217 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4218 errorType = "isBoundary()";
4219 currentBreakData = isBoundaryBreaks;
4220 } else if (followingBreaks[i] != expectedBreaks[i]) {
4221 errorType = "following()";
4222 currentBreakData = followingBreaks;
4223 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4224 errorType = "preceding()";
4225 currentBreakData = precedingBreaks;
4226 }
4227
4228 if (errorType != NULL) {
4229 // Format a range of the test text that includes the failure as
4230 // a data item that can be included in the rbbi test data file.
4231
4232 // Start of the range is the last point where expected and actual results
4233 // both agreed that there was a break position.
4234
4235 int startContext = i;
4236 int32_t count = 0;
4237 for (;;) {
4238 if (startContext==0) { break; }
4239 startContext --;
4240 if (expectedBreaks[startContext] != 0) {
4241 if (count == 2) break;
4242 count ++;
4243 }
4244 }
4245
4246 // End of range is two expected breaks past the start position.
4247 int endContext = i + 1;
4248 int ci;
4249 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4250 for (;;) {
4251 if (endContext >= testText.length()) {break;}
4252 if (expectedBreaks[endContext-1] != 0) {
4253 if (count == 0) break;
4254 count --;
4255 }
4256 endContext ++;
4257 }
4258 }
4259
4260 // Formatting of each line includes:
4261 // character code
4262 // reference break: '|' -> a break, '.' -> no break
4263 // actual break: '|' -> a break, '.' -> no break
4264 // (name of character clase)
4265 // Unicode name of character
4266 // '-->' indicates location of the difference.
4267
4268 MONKEY_ERROR(
4269 (expectedBreaks[i] ? "Break expected but not found" :
4270 "Break found but not expected"),
4271 name, i, seed);
4272
4273 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4274 UChar32 c;
4275 c = testText.char32At(ci);
4276
4277 std::string currentLineFlag = " ";
4278 if (ci == i) {
4279 currentLineFlag = "-->"; // Error position
4280 }
4281
4282 // BMP or SMP character in hex
4283 char hexCodePoint[12];
4284 std::string format = " \\u%04x";
4285 if (c >= 0x10000) {
4286 format = "\\U%08x";
4287 }
4288 sprintf(hexCodePoint, format.c_str(), c);
4289
4290 // Get the class name and character name for the character.
4291 char cName[200];
4292 UErrorCode status = U_ZERO_ERROR;
4293 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4294
4295 char buffer[200];
4296 snprintf(buffer, 200,
4297 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4298 currentLineFlag.c_str(),
4299 ci,
4300 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4301 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4302 hexCodePoint,
4303 classNameSize,
4304 mk.classNameFromCodepoint(c).c_str(),
4305 mk.getAppliedRule(ci).c_str(), cName);
4306
4307 // Output the error
4308 if (ci == i) {
4309 errln(buffer);
4310 } else {
4311 infoln(buffer);
4312 }
4313
4314 if (ci >= endContext) { break; }
4315 }
4316 break;
4317 }
4318 }
4319
4320 loopCount++;
4321 }
4322 #endif
4323 }
4324
4325
4326 // Bug 5532. UTF-8 based UText fails in dictionary code.
4327 // This test checks the initial patch,
4328 // which is to just keep it from crashing. Correct word boundaries
4329 // await a proper fix to the dictionary code.
4330 //
4331 void RBBITest::TestBug5532(void) {
4332 // Text includes a mixture of Thai and Latin.
4333 const unsigned char utf8Data[] = {
4334 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4335 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4336 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4337 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4338 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4339 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4340 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4341 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4342 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4343 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4344 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4345
4346 UErrorCode status = U_ZERO_ERROR;
4347 UText utext=UTEXT_INITIALIZER;
4348 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4349 TEST_ASSERT_SUCCESS(status);
4350
4351 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4352 TEST_ASSERT_SUCCESS(status);
4353 if (U_SUCCESS(status)) {
4354 bi->setText(&utext, status);
4355 TEST_ASSERT_SUCCESS(status);
4356
4357 int32_t breakCount = 0;
4358 int32_t previousBreak = -1;
4359 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4360 // For now, just make sure that the break iterator doesn't hang.
4361 TEST_ASSERT(previousBreak < bi->current());
4362 previousBreak = bi->current();
4363 }
4364 TEST_ASSERT(breakCount > 0);
4365 }
4366 delete bi;
4367 utext_close(&utext);
4368 }
4369
4370
4371 void RBBITest::TestBug9983(void) {
4372 UnicodeString text = UnicodeString("\\u002A" // * Other
4373 "\\uFF65" // Other
4374 "\\u309C" // Katakana
4375 "\\uFF9F" // Extend
4376 "\\uFF65" // Other
4377 "\\u0020" // Other
4378 "\\u0000").unescape();
4379
4380 UErrorCode status = U_ZERO_ERROR;
4381 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4382 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4383 TEST_ASSERT_SUCCESS(status);
4384 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4385 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4386 TEST_ASSERT_SUCCESS(status);
4387 if (U_FAILURE(status)) {
4388 return;
4389 }
4390 int32_t offset, rstatus, iterationCount;
4391
4392 brkiter->setText(text);
4393 brkiter->last();
4394 iterationCount = 0;
4395 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4396 iterationCount++;
4397 rstatus = brkiter->getRuleStatus();
4398 (void)rstatus; // Suppress set but not used warning.
4399 if (iterationCount >= 10) {
4400 break;
4401 }
4402 }
4403 TEST_ASSERT(iterationCount == 6);
4404
4405 brkiterPOSIX->setText(text);
4406 brkiterPOSIX->last();
4407 iterationCount = 0;
4408 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4409 iterationCount++;
4410 rstatus = brkiterPOSIX->getRuleStatus();
4411 (void)rstatus; // Suppress set but not used warning.
4412 if (iterationCount >= 10) {
4413 break;
4414 }
4415 }
4416 TEST_ASSERT(iterationCount == 6);
4417 }
4418
4419 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4420 //
4421 void RBBITest::TestBug7547() {
4422 UnicodeString rules;
4423 UErrorCode status = U_ZERO_ERROR;
4424 UParseError parseError;
4425 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4426 if (status != U_BRK_RULE_SYNTAX) {
4427 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4428 }
4429 if (parseError.line != 1 || parseError.offset != 0) {
4430 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4431 }
4432 }
4433
4434
4435 void RBBITest::TestBug12797() {
4436 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4437 UErrorCode status = U_ZERO_ERROR;
4438 UParseError parseError;
4439 RuleBasedBreakIterator bi(rules, parseError, status);
4440 if (U_FAILURE(status)) {
4441 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4442 return;
4443 }
4444 UnicodeString text = "abc";
4445 bi.setText(text);
4446 bi.first();
4447 int32_t boundary = bi.next();
4448 if (boundary != 3) {
4449 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4450 }
4451 }
4452
4453 void RBBITest::TestBug12918() {
4454 // This test triggers an assertion failure in dictbe.cpp
4455 const UChar *crasherString = u"\u3325\u4a16";
4456 UErrorCode status = U_ZERO_ERROR;
4457 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4458 if (U_FAILURE(status)) {
4459 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4460 return;
4461 }
4462 ubrk_first(iter);
4463 int32_t pos = 0;
4464 int32_t lastPos = -1;
4465 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4466 if (pos <= lastPos) {
4467 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4468 break;
4469 }
4470 }
4471 ubrk_close(iter);
4472 }
4473
4474 void RBBITest::TestBug12932() {
4475 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4476 UnicodeString ruleStr(
4477 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4478 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4479 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4480 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4481 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4482 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4483
4484 UErrorCode status = U_ZERO_ERROR;
4485 UParseError parseError;
4486 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4487 if (status != U_BRK_RULE_SYNTAX) {
4488 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4489 __FILE__, __LINE__, u_errorName(status));
4490 }
4491 }
4492
4493
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
4496 void RBBITest::TestEmoji() {
4497 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4498 UErrorCode status = U_ZERO_ERROR;
4499
4500 CharString testFileName;
4501 testFileName.append(IntlTest::getSourceTestData(status), status);
4502 testFileName.appendPathPart("emoji-test.txt", status);
4503 if (U_FAILURE(status)) {
4504 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4505 return;
4506 }
4507 logln("Opening data file %s\n", testFileName.data());
4508
4509 int len;
4510 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4511 if (U_FAILURE(status) || testFile == NULL) {
4512 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4513 return;
4514 }
4515 UnicodeString testFileAsString(testFile, len);
4516 delete [] testFile;
4517
4518 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4519 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4520 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4521 int32_t lineNumber = 0;
4522
4523 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4524 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4525 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4526 if (U_FAILURE(status)) {
4527 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4528 return;
4529 }
4530
4531 while (lineMatcher.find()) {
4532 ++lineNumber;
4533 UnicodeString line = lineMatcher.group(status);
4534 hexMatcher.reset(line);
4535 UnicodeString testString; // accumulates the emoji sequence.
4536 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4537 UnicodeString hex = hexMatcher.group(1, status);
4538 if (hex.length() > 8) {
4539 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4540 break;
4541 }
4542 CharString hex8;
4543 hex8.appendInvariantChars(hex, status);
4544 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4545 if (c<=0x10ffff) {
4546 testString.append(c);
4547 } else {
4548 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4549 __FILE__, __LINE__, lineNumber, hex8.data());
4550 break;
4551 }
4552 }
4553
4554 if (testString.length() > 1) {
4555 charBreaks->setText(testString);
4556 charBreaks->first();
4557 int32_t firstBreak = charBreaks->next();
4558 if (testString.length() != firstBreak) {
4559 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4560 __FILE__, __LINE__, lineNumber, firstBreak);
4561 }
4562 wordBreaks->setText(testString);
4563 wordBreaks->first();
4564 firstBreak = wordBreaks->next();
4565 if (testString.length() != firstBreak) {
4566 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4567 __FILE__, __LINE__, lineNumber, firstBreak);
4568 }
4569 lineBreaks->setText(testString);
4570 lineBreaks->first();
4571 firstBreak = lineBreaks->next();
4572 if (testString.length() != firstBreak) {
4573 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4574 __FILE__, __LINE__, lineNumber, firstBreak);
4575 }
4576 }
4577 }
4578 #endif
4579 }
4580
4581
4582 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4583
4584 void RBBITest::TestBug12519() {
4585 UErrorCode status = U_ZERO_ERROR;
4586 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4587 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4588 if (!assertSuccess(WHERE, status)) {
4589 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4590 return;
4591 }
4592 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4593
4594 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4595 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4596
4597 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4598 assertTrue(WHERE, *biEn == *cloneEn);
4599 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4600
4601 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4602 assertTrue(WHERE, *biFr == *cloneFr);
4603 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4604
4605 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4606 UnicodeString text("Hallo Welt");
4607 biDe->setText(text);
4608 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4609 *biDe = *biFr;
4610 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4611 }
4612
4613 void RBBITest::TestBug12677() {
4614 // Check that stripping of comments from rules for getRules() is not confused by
4615 // the presence of '#' characters in the rules that do not introduce comments.
4616 UnicodeString rules(u"!!forward; \n"
4617 "$x = [ab#]; # a set with a # literal. \n"
4618 " # .; # a comment that looks sort of like a rule. \n"
4619 " '#' '?'; # a rule with a quoted # \n"
4620 );
4621
4622 UErrorCode status = U_ZERO_ERROR;
4623 UParseError pe;
4624 RuleBasedBreakIterator bi(rules, pe, status);
4625 assertSuccess(WHERE, status);
4626 UnicodeString rtRules = bi.getRules();
4627 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
4628 }
4629
4630
4631 void RBBITest::TestTableRedundancies() {
4632 UErrorCode status = U_ZERO_ERROR;
4633
4634 LocalPointer<RuleBasedBreakIterator> bi (
4635 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4636 assertSuccess(WHERE, status);
4637 if (U_FAILURE(status)) return;
4638
4639 RBBIDataWrapper *dw = bi->fData;
4640 const RBBIStateTable *fwtbl = dw->fForwardTable;
4641 int32_t numCharClasses = dw->fHeader->fCatCount;
4642 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4643
4644 // Check for duplicate columns (character categories)
4645
4646 std::vector<UnicodeString> columns;
4647 for (int32_t column = 0; column < numCharClasses; column++) {
4648 UnicodeString s;
4649 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4650 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4651 s.append(row->fNextState[column]);
4652 }
4653 columns.push_back(s);
4654 }
4655 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4656 for (int c1=1; c1<numCharClasses; c1++) {
4657 for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4658 if (columns.at(c1) == columns.at(c2)) {
4659 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4660 goto out;
4661 }
4662 }
4663 }
4664 out:
4665
4666 // Check for duplicate states
4667 std::vector<UnicodeString> rows;
4668 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4669 UnicodeString s;
4670 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4671 assertTrue(WHERE, row->fAccepting >= -1);
4672 s.append(row->fAccepting + 1); // values of -1 are expected.
4673 s.append(row->fLookAhead);
4674 s.append(row->fTagIdx);
4675 for (int32_t column = 0; column < numCharClasses; column++) {
4676 s.append(row->fNextState[column]);
4677 }
4678 rows.push_back(s);
4679 }
4680 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4681 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4682 if (rows.at(r1) == rows.at(r2)) {
4683 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4684 return;
4685 }
4686 }
4687 }
4688 }
4689
4690 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4691 // even after next() has returned DONE.
4692
4693 void RBBITest::TestBug13447() {
4694 UErrorCode status = U_ZERO_ERROR;
4695 LocalPointer<RuleBasedBreakIterator> bi(
4696 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4697 assertSuccess(WHERE, status);
4698 if (U_FAILURE(status)) return;
4699 UnicodeString data(u"1234");
4700 bi->setText(data);
4701 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4702 assertEquals(WHERE, 4, bi->next());
4703 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4704 assertEquals(WHERE, UBRK_DONE, bi->next());
4705 assertEquals(WHERE, 4, bi->current());
4706 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4707 }
4708
4709 // TestReverse exercises both the synthesized safe reverse rules and the logic
4710 // for filling the break iterator cache when starting from random positions
4711 // in the text.
4712 //
4713 // It's a monkey test, working on random data, with the expected data obtained
4714 // from forward iteration (no safe rules involved), comparing with results
4715 // when indexing into the interior of the string (safe rules needed).
4716
4717 void RBBITest::TestReverse() {
4718 UErrorCode status = U_ZERO_ERROR;
4719
4720 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4721 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4722 assertSuccess(WHERE, status, true);
4723 status = U_ZERO_ERROR;
4724 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4725 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4726 assertSuccess(WHERE, status, true);
4727 status = U_ZERO_ERROR;
4728 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4729 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4730 assertSuccess(WHERE, status, true);
4731 status = U_ZERO_ERROR;
4732 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4733 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4734 assertSuccess(WHERE, status, true);
4735 }
4736
4737 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4738 if (!bi) {
4739 return;
4740 }
4741
4742 // From the mapping trie in the break iterator's internal data, create a
4743 // vector of UnicodeStrings, one for each character category, containing
4744 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4745 // to avoid an execess of unassigned code points.
4746
4747 RBBIDataWrapper *data = bi->fData;
4748 int32_t categoryCount = data->fHeader->fCatCount;
4749 UTrie2 *trie = data->fTrie;
4750
4751 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4752 for (int cp=0; cp<0x1fff0; ++cp) {
4753 int cat = utrie2_get32(trie, cp);
4754 cat &= ~0x4000; // And off the dictionary bit from the category.
4755 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4756 if (cat < 0 || cat >= categoryCount) return;
4757 strings[cat].append(cp);
4758 }
4759
4760 icu_rand randomGen;
4761 const int testStringLength = 10000;
4762 UnicodeString testString;
4763
4764 for (int i=0; i<testStringLength; ++i) {
4765 int charClass = randomGen() % categoryCount;
4766 if (strings[charClass].length() > 0) {
4767 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4768 testString.append(cp);
4769 }
4770 }
4771
4772 typedef std::pair<UBool, int32_t> Result;
4773 std::vector<Result> expectedResults;
4774 bi->setText(testString);
4775 for (int i=0; i<testString.length(); ++i) {
4776 bool isboundary = bi->isBoundary(i);
4777 int ruleStatus = bi->getRuleStatus();
4778 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4779 }
4780
4781 for (int i=testString.length()-1; i>=0; --i) {
4782 bi->setText(testString); // clears the internal break cache
4783 Result expected = expectedResults[i];
4784 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4785 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4786 }
4787 }
4788
4789
4790 // Ticket 13692 - finding word boundaries in very large numbers or words could
4791 // be very time consuming. When the problem was present, this void test
4792 // would run more than fifteen minutes, which is to say, the failure was noticeale.
4793
4794 void RBBITest::TestBug13692() {
4795 UErrorCode status = U_ZERO_ERROR;
4796 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4797 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4798 if (!assertSuccess(WHERE, status, true)) {
4799 return;
4800 }
4801 constexpr int32_t LENGTH = 1000000;
4802 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4803 for (int i=0; i<20; i+=2) {
4804 longNumber.setCharAt(i, u' ');
4805 }
4806 bi->setText(longNumber);
4807 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4808 assertSuccess(WHERE, status);
4809 }
4810
4811 //
4812 // TestDebug - A place-holder test for debugging purposes.
4813 // For putting in fragments of other tests that can be invoked
4814 // for tracing without a lot of unwanted extra stuff happening.
4815 //
4816 void RBBITest::TestDebug(void) {
4817 UErrorCode status = U_ZERO_ERROR;
4818 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4819 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4820 if (!assertSuccess(WHERE, status, true)) {
4821 return;
4822 }
4823 const UnicodeString &rules = bi->getRules();
4824 UParseError pe;
4825 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4826 assertSuccess(WHERE, status);
4827 }
4828
4829 void RBBITest::TestProperties() {
4830 UErrorCode errorCode = U_ZERO_ERROR;
4831 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4832 if (!prependSet.isEmpty()) {
4833 errln(
4834 "[:GCB=Prepend:] is not empty any more. "
4835 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4836 "change this test to the opposite condition.");
4837 }
4838 }
4839
4840 #endif // #if !UCONFIG_NO_BREAK_ITERATION