]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/********************************************************************
4 * COPYRIGHT:
2ca993e8 5 * Copyright (c) 1999-2016, International Business Machines Corporation and
73c04bcf
A
6 * others. All Rights Reserved.
7 ********************************************************************/
8/************************************************************************
9* Date Name Description
10* 12/15/99 Madhu Creation.
11* 01/12/2000 Madhu Updated for changed API and added new tests
12************************************************************************/
13
14#include "unicode/utypes.h"
73c04bcf
A
15#if !UCONFIG_NO_BREAK_ITERATION
16
2ca993e8
A
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
0f5d89e8
A
20#include <utility>
21#include <vector>
2ca993e8 22
73c04bcf 23#include "unicode/brkiter.h"
2ca993e8
A
24#include "unicode/localpointer.h"
25#include "unicode/numfmt.h"
73c04bcf 26#include "unicode/rbbi.h"
2ca993e8
A
27#if !UCONFIG_NO_REGULAR_EXPRESSIONS
28#include "unicode/regex.h"
29#endif
30#include "unicode/schriter.h"
73c04bcf
A
31#include "unicode/uchar.h"
32#include "unicode/utf16.h"
33#include "unicode/ucnv.h"
73c04bcf 34#include "unicode/uniset.h"
2ca993e8 35#include "unicode/uscript.h"
73c04bcf
A
36#include "unicode/ustring.h"
37#include "unicode/utext.h"
2ca993e8
A
38
39#include "charstr.h"
40#include "cmemory.h"
f3c0d7a5 41#include "cstr.h"
73c04bcf
A
42#include "intltest.h"
43#include "rbbitst.h"
0f5d89e8 44#include "rbbidata.h"
2ca993e8 45#include "utypeinfo.h" // for 'typeid' to work
73c04bcf
A
46#include "uvector.h"
47#include "uvectr32.h"
2ca993e8 48
340931cb
A
49// Needed for Apple perf tests <rdar://problem/51193810>
50#include <unistd.h>
51#include <mach/mach_time.h>
52
0f5d89e8 53
2ca993e8
A
54#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55#include "unicode/filteredbrk.h"
56#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
73c04bcf 57
340931cb
A
// TEST_ASSERT(x)
//   Logs a failure, tagged with the current source file and line, whenever the
//   condition x is false. errln() is called unqualified, so the macro has to be
//   expanded somewhere errln() is in scope.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (x) { \
    } else { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// TEST_ASSERT_SUCCESS(errcode)
//   Reports through errcheckln(), with file, line and the ICU error name, when
//   errcode is a failure code. Does nothing for success and warning codes.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// MONKEY_ERROR(msg, fRuleFileName, index, seed)
//   Reports a monkey-test failure through the global IntlTest instance and
//   prints the parameter string (type, seed, loop count) for reproducing it.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln( \
        "\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
        __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
73c04bcf 74
46f4442e
A
75//---------------------------------------------
76// runIndexedTest
77//---------------------------------------------
78
4388f060 79
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitst.txt. In most cases it can;
//       that is much less work than writing a new test, the diagnostic output in the
//       event of failures is good, and the test data file is shared with ICU4J, so
//       eventually the test will run there as well, without additional effort.
85
46f4442e
A
86void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87{
88 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
f3c0d7a5 89 fTestParams = params;
46f4442e 90
f3c0d7a5 91 TESTCASE_AUTO_BEGIN;
729e4ab9 92#if !UCONFIG_NO_FILE_IO
f3c0d7a5 93 TESTCASE_AUTO(TestBug4153072);
729e4ab9 94#endif
729e4ab9 95#if !UCONFIG_NO_FILE_IO
f3c0d7a5 96 TESTCASE_AUTO(TestUnicodeFiles);
729e4ab9 97#endif
f3c0d7a5
A
98 TESTCASE_AUTO(TestGetAvailableLocales);
99 TESTCASE_AUTO(TestGetDisplayName);
729e4ab9 100#if !UCONFIG_NO_FILE_IO
f3c0d7a5
A
101 TESTCASE_AUTO(TestEndBehaviour);
102 TESTCASE_AUTO(TestWordBreaks);
103 TESTCASE_AUTO(TestWordBoundary);
104 TESTCASE_AUTO(TestLineBreaks);
105 TESTCASE_AUTO(TestSentBreaks);
106 TESTCASE_AUTO(TestExtended);
729e4ab9 107#endif
4388f060 108#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
f3c0d7a5 109 TESTCASE_AUTO(TestMonkey);
4388f060 110#endif
729e4ab9 111#if !UCONFIG_NO_FILE_IO
f3c0d7a5 112 TESTCASE_AUTO(TestBug3818);
729e4ab9 113#endif
f3c0d7a5 114 TESTCASE_AUTO(TestDebug);
729e4ab9 115#if !UCONFIG_NO_FILE_IO
f3c0d7a5 116 TESTCASE_AUTO(TestBug5775);
729e4ab9 117#endif
f3c0d7a5
A
118 TESTCASE_AUTO(TestBug9983);
119 TESTCASE_AUTO(TestDictRules);
120 TESTCASE_AUTO(TestBug5532);
121 TESTCASE_AUTO(TestBug7547);
122 TESTCASE_AUTO(TestBug12797);
123 TESTCASE_AUTO(TestBug12918);
124 TESTCASE_AUTO(TestBug12932);
125 TESTCASE_AUTO(TestEmoji);
0f5d89e8
A
126 TESTCASE_AUTO(TestBug12519);
127 TESTCASE_AUTO(TestBug12677);
128 TESTCASE_AUTO(TestTableRedundancies);
129 TESTCASE_AUTO(TestBug13447);
130 TESTCASE_AUTO(TestReverse);
131 TESTCASE_AUTO(TestBug13692);
f3c0d7a5 132 TESTCASE_AUTO_END;
46f4442e
A
133}
134
135
73c04bcf
A
136//--------------------------------------------------------------------------------------
137//
138// RBBITest constructor and destructor
139//
140//--------------------------------------------------------------------------------------
141
142RBBITest::RBBITest() {
f3c0d7a5 143 fTestParams = NULL;
73c04bcf
A
144}
145
146
147RBBITest::~RBBITest() {
73c04bcf
A
148}
149
73c04bcf 150
b331163b 151static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
73c04bcf
A
152 UErrorCode status = U_ZERO_ERROR;
153 char name[100];
154 printf("code alpha extend alphanum type word sent line name\n");
b331163b
A
155 int nextExpectedIndex = 0;
156 utext_setNativeIndex(tstr, 0);
3d1f044b 157 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
b331163b
A
158 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
159 printf("------------------------------------------------ %d\n", j);
160 ++nextExpectedIndex;
73c04bcf 161 }
b331163b
A
162
163 UChar32 c = utext_next32(tstr);
73c04bcf
A
164 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
165 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
166 u_isUAlphabetic(c),
167 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
168 u_isalnum(c),
169 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
170 u_charType(c),
171 U_SHORT_PROPERTY_NAME),
172 u_getPropertyValueName(UCHAR_WORD_BREAK,
173 u_getIntPropertyValue(c,
174 UCHAR_WORD_BREAK),
175 U_SHORT_PROPERTY_NAME),
176 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
177 u_getIntPropertyValue(c,
178 UCHAR_SENTENCE_BREAK),
179 U_SHORT_PROPERTY_NAME),
180 u_getPropertyValueName(UCHAR_LINE_BREAK,
181 u_getIntPropertyValue(c,
182 UCHAR_LINE_BREAK),
183 U_SHORT_PROPERTY_NAME),
184 name);
185 }
186}
187
73c04bcf 188
b331163b
A
189static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
190 UErrorCode status = U_ZERO_ERROR;
191 UText *tstr = NULL;
192 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
193 if (U_FAILURE(status)) {
194 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
195 return;
196 }
197 printStringBreaks(tstr, expected, expectedCount);
198 utext_close(tstr);
199}
200
201
73c04bcf
A
202void RBBITest::TestBug3818() {
203 UErrorCode status = U_ZERO_ERROR;
204
205 // Four Thai words...
206 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
207 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
208 UnicodeString thaiStr(thaiWordData);
209
57a6839d 210 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
73c04bcf 211 if (U_FAILURE(status) || bi == NULL) {
729e4ab9 212 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
73c04bcf
A
213 return;
214 }
215 bi->setText(thaiStr);
216
217 int32_t startOfSecondWord = bi->following(1);
218 if (startOfSecondWord != 4) {
219 errln("Fail at file %s, line %d expected start of word at 4, got %d",
220 __FILE__, __LINE__, startOfSecondWord);
221 }
222 startOfSecondWord = bi->following(0);
223 if (startOfSecondWord != 4) {
224 errln("Fail at file %s, line %d expected start of word at 4, got %d",
225 __FILE__, __LINE__, startOfSecondWord);
226 }
227 delete bi;
228}
229
73c04bcf
A
230
231//---------------------------------------------
232//
233// other tests
234//
235//---------------------------------------------
73c04bcf
A
236
237void RBBITest::TestGetAvailableLocales()
238{
239 int32_t locCount = 0;
240 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
241
242 if (locCount == 0)
729e4ab9 243 dataerrln("getAvailableLocales() returned an empty list!");
73c04bcf
A
244 // Just make sure that it's returning good memory.
245 int32_t i;
246 for (i = 0; i < locCount; ++i) {
247 logln(locList[i].getName());
248 }
249}
250
251//Testing the BreakIterator::getDisplayName() function
252void RBBITest::TestGetDisplayName()
253{
254 UnicodeString result;
255
256 BreakIterator::getDisplayName(Locale::getUS(), result);
257 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
729e4ab9 258 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
73c04bcf
A
259 + result);
260
261 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
262 if (result != "French (France)")
729e4ab9 263 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
73c04bcf
A
264 + result);
265}
266/**
267 * Test End Behaviour
268 * @bug 4068137
269 */
270void RBBITest::TestEndBehaviour()
271{
272 UErrorCode status = U_ZERO_ERROR;
273 UnicodeString testString("boo.");
274 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
275 if (U_FAILURE(status))
276 {
729e4ab9 277 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
73c04bcf
A
278 return;
279 }
280 wb->setText(testString);
281
282 if (wb->first() != 0)
283 errln("Didn't get break at beginning of string.");
284 if (wb->next() != 3)
285 errln("Didn't get break before period in \"boo.\"");
286 if (wb->current() != 4 && wb->next() != 4)
287 errln("Didn't get break at end of string.");
288 delete wb;
289}
290/*
291 * @bug 4153072
292 */
293void RBBITest::TestBug4153072() {
294 UErrorCode status = U_ZERO_ERROR;
295 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
296 if (U_FAILURE(status))
297 {
729e4ab9 298 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
73c04bcf
A
299 return;
300 }
301 UnicodeString str("...Hello, World!...");
302 int32_t begin = 3;
303 int32_t end = str.length() - 3;
304 UBool onBoundary;
305
306 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
307 iter->adoptText(textIterator);
308 int index;
309 // Note: with the switch to UText, there is no way to restrict the
310 // iteration range to begin at an index other than zero.
311 // String character iterators created with a non-zero bound are
312 // treated by RBBI as being empty.
313 for (index = -1; index < begin + 1; ++index) {
314 onBoundary = iter->isBoundary(index);
315 if (index == 0? !onBoundary : onBoundary) {
316 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
317 " and begin index = " + begin);
318 }
319 }
320 delete iter;
321}
322
323
46f4442e
A
324//
325// Test for problem reported by Ashok Matoria on 9 July 2007
326// One.<kSoftHyphen><kSpace>Two.
327//
328// Sentence break at start (0) and then on calling next() it breaks at
329// 'T' of "Two". Now, at this point if I do next() and
330// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
331//
332void RBBITest::TestBug5775() {
333 UErrorCode status = U_ZERO_ERROR;
334 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
335 TEST_ASSERT_SUCCESS(status);
729e4ab9
A
336 if (U_FAILURE(status)) {
337 return;
338 }
339// Check for status first for better handling of no data errors.
46f4442e 340 TEST_ASSERT(bi != NULL);
729e4ab9 341 if (bi == NULL) {
46f4442e
A
342 return;
343 }
2ca993e8 344
46f4442e
A
345 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
346 // 01234 56789
347 s = s.unescape();
348 bi->setText(s);
349 int pos = bi->next();
350 TEST_ASSERT(pos == 6);
351 pos = bi->next();
352 TEST_ASSERT(pos == 10);
353 pos = bi->previous();
354 TEST_ASSERT(pos == 6);
355 delete bi;
356}
357
358
359
73c04bcf
A
360//------------------------------------------------------------------------------
361//
362// RBBITest::Extended Run RBBI Tests from an external test data file
363//
364//------------------------------------------------------------------------------
365
366struct TestParams {
b331163b
A
367 BreakIterator *bi; // Break iterator is set while parsing test source.
368 // Changed out whenever test data changes break type.
369
370 UnicodeString dataToBreak; // Data that is built up while parsing the test.
371 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
372 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
73c04bcf 373 UVector32 *srcCol;
b331163b
A
374
375 UText *textToBreak; // UText, could be UTF8 or UTF16.
376 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
377 CharString utf8String; // UTF-8 form of text to break.
378
379 TestParams(UErrorCode &status) : dataToBreak() {
380 bi = NULL;
381 expectedBreaks = new UVector32(status);
382 srcLine = new UVector32(status);
383 srcCol = new UVector32(status);
384 textToBreak = NULL;
385 textMap = new UVector32(status);
386 }
387
388 ~TestParams() {
389 delete bi;
390 delete expectedBreaks;
391 delete srcLine;
392 delete srcCol;
393 utext_close(textToBreak);
394 delete textMap;
395 }
2ca993e8 396
b331163b
A
397 int32_t getSrcLine(int32_t bp);
398 int32_t getExpectedBreak(int32_t bp);
399 int32_t getSrcCol(int32_t bp);
400
401 void setUTF16(UErrorCode &status);
402 void setUTF8(UErrorCode &status);
73c04bcf
A
403};
404
b331163b
A
405// Append a UnicodeString to a CharString with UTF-8 encoding.
406// Substitute any invalid chars.
407// Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
408static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
409 if (U_FAILURE(status)) {
410 return;
411 }
412 int32_t utf8Length;
413 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
414 src.getBuffer(), src.length(), // UTF-16 data
415 0xfffd, NULL, // Substitution char, number of subs.
416 &status);
417 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
418 return;
419 }
420 status = U_ZERO_ERROR;
421 int32_t capacity;
422 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
423 u_strToUTF8WithSub(buffer, utf8Length, NULL,
424 src.getBuffer(), src.length(),
425 0xfffd, NULL, &status);
426 dest.append(buffer, utf8Length, status);
427}
2ca993e8 428
b331163b
A
429
430void TestParams::setUTF16(UErrorCode &status) {
431 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
432 textMap->removeAllElements();
433 for (int32_t i=0; i<dataToBreak.length(); i++) {
434 if (i == dataToBreak.getChar32Start(i)) {
435 textMap->addElement(i, status);
436 } else {
437 textMap->addElement(-1, status);
438 }
439 }
440 textMap->addElement(dataToBreak.length(), status);
441 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
442}
443
444
445void TestParams::setUTF8(UErrorCode &status) {
446 if (U_FAILURE(status)) {
447 return;
448 }
449 utf8String.clear();
450 CharStringAppend(utf8String, dataToBreak, status);
451 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
452 if (U_FAILURE(status)) {
453 return;
454 }
455
456 textMap->removeAllElements();
457 int32_t utf16Index = 0;
458 for (;;) {
459 textMap->addElement(utf16Index, status);
460 UChar32 c32 = utext_current32(textToBreak);
461 if (c32 < 0) {
462 break;
463 }
464 utf16Index += U16_LENGTH(c32);
465 utext_next32(textToBreak);
466 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
467 textMap->addElement(-1, status);
468 }
469 }
470 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
471}
472
473
f3c0d7a5 474int32_t TestParams::getSrcLine(int32_t bp) {
b331163b
A
475 if (bp >= textMap->size()) {
476 bp = textMap->size() - 1;
477 }
478 int32_t i = 0;
479 for(; bp >= 0 ; --bp) {
480 // Move to a character boundary if we are not on one already.
481 i = textMap->elementAti(bp);
482 if (i >= 0) {
483 break;
484 }
485 }
486 return srcLine->elementAti(i);
487}
488
489
f3c0d7a5 490int32_t TestParams::getExpectedBreak(int32_t bp) {
b331163b
A
491 if (bp >= textMap->size()) {
492 return 0;
493 }
494 int32_t i = textMap->elementAti(bp);
495 int32_t retVal = 0;
496 if (i >= 0) {
497 retVal = expectedBreaks->elementAti(i);
498 }
499 return retVal;
500}
501
502
f3c0d7a5 503int32_t TestParams::getSrcCol(int32_t bp) {
b331163b
A
504 if (bp >= textMap->size()) {
505 bp = textMap->size() - 1;
506 }
507 int32_t i = 0;
508 for(; bp >= 0; --bp) {
509 // Move bp to a character boundary if we are not on one already.
510 i = textMap->elementAti(bp);
511 if (i >= 0) {
512 break;
513 }
514 }
515 return srcCol->elementAti(i);
516}
517
518
519void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
73c04bcf
A
520 int32_t bp;
521 int32_t prevBP;
522 int32_t i;
523
b331163b
A
524 TEST_ASSERT_SUCCESS(status);
525 if (U_FAILURE(status)) {
526 return;
527 }
528
73c04bcf
A
529 if (t->bi == NULL) {
530 return;
531 }
532
b331163b 533 t->bi->setText(t->textToBreak, status);
73c04bcf
A
534 //
535 // Run the iterator forward
536 //
537 prevBP = -1;
538 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
539 if (prevBP == bp) {
540 // Fail for lack of forward progress.
541 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
b331163b 542 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
543 break;
544 }
545
b331163b 546 // Check that there we didn't miss an expected break between the last one
73c04bcf
A
547 // and this one.
548 for (i=prevBP+1; i<bp; i++) {
b331163b 549 if (t->getExpectedBreak(i) != 0) {
73c04bcf
A
550 int expected[] = {0, i};
551 printStringBreaks(t->dataToBreak, expected, 2);
552 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 553 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
554 }
555 }
556
557 // Check that the break we did find was expected
b331163b 558 if (t->getExpectedBreak(bp) == 0) {
73c04bcf 559 int expected[] = {0, bp};
b331163b 560 printStringBreaks(t->textToBreak, expected, 2);
73c04bcf 561 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
b331163b 562 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
563 } else {
564 // The break was expected.
565 // Check that the {nnn} tag value is correct.
b331163b 566 int32_t expectedTagVal = t->getExpectedBreak(bp);
73c04bcf
A
567 if (expectedTagVal == -1) {
568 expectedTagVal = 0;
569 }
b331163b 570 int32_t line = t->getSrcLine(bp);
f3c0d7a5 571 int32_t rs = t->bi->getRuleStatus();
73c04bcf
A
572 if (rs != expectedTagVal) {
573 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
574 " Actual, Expected status = %4d, %4d",
b331163b 575 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
73c04bcf
A
576 }
577 }
578
73c04bcf
A
579 prevBP = bp;
580 }
581
582 // Verify that there were no missed expected breaks after the last one found
b331163b
A
583 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
584 if (t->getExpectedBreak(i) != 0) {
73c04bcf 585 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 586 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
587 }
588 }
589
590 //
591 // Run the iterator backwards, verify that the same breaks are found.
592 //
3d1f044b 593 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
0f5d89e8
A
594 bp = t->bi->last();
595 while (bp != BreakIterator::DONE) {
73c04bcf
A
596 if (prevBP == bp) {
597 // Fail for lack of progress.
598 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
b331163b 599 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
600 break;
601 }
602
b331163b 603 // Check that we didn't miss an expected break between the last one
73c04bcf
A
604 // and this one. (UVector returns zeros for index out of bounds.)
605 for (i=prevBP-1; i>bp; i--) {
b331163b
A
606 if (t->getExpectedBreak(i) != 0) {
607 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
608 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
609 }
610 }
611
612 // Check that the break we did find was expected
b331163b 613 if (t->getExpectedBreak(bp) == 0) {
73c04bcf 614 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
b331163b 615 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
616 } else {
617 // The break was expected.
618 // Check that the {nnn} tag value is correct.
b331163b 619 int32_t expectedTagVal = t->getExpectedBreak(bp);
73c04bcf
A
620 if (expectedTagVal == -1) {
621 expectedTagVal = 0;
622 }
b331163b
A
623 int line = t->getSrcLine(bp);
624 int32_t rs = t->bi->getRuleStatus();
73c04bcf
A
625 if (rs != expectedTagVal) {
626 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
627 " Actual, Expected status = %4d, %4d",
b331163b 628 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
73c04bcf
A
629 }
630 }
631
632 prevBP = bp;
0f5d89e8 633 bp = t->bi->previous();
73c04bcf
A
634 }
635
636 // Verify that there were no missed breaks prior to the last one found
637 for (i=prevBP-1; i>=0; i--) {
b331163b 638 if (t->getExpectedBreak(i) != 0) {
73c04bcf 639 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 640 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
641 }
642 }
51004dcb
A
643
644 // Check isBoundary()
b331163b
A
645 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
646 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
51004dcb
A
647 UBool boundaryFound = t->bi->isBoundary(i);
648 if (boundaryExpected != boundaryFound) {
649 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
650 " Expected, Actual= %s, %s",
b331163b 651 i, t->getSrcLine(i), t->getSrcCol(i),
51004dcb
A
652 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
653 }
654 }
655
656 // Check following()
3d1f044b 657 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
51004dcb
A
658 int32_t actualBreak = t->bi->following(i);
659 int32_t expectedBreak = BreakIterator::DONE;
3d1f044b 660 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
b331163b 661 if (t->getExpectedBreak(j) != 0) {
51004dcb
A
662 expectedBreak = j;
663 break;
664 }
665 }
666 if (expectedBreak != actualBreak) {
667 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
668 " Expected, Actual= %d, %d",
b331163b 669 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
51004dcb
A
670 }
671 }
672
673 // Check preceding()
3d1f044b 674 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
51004dcb
A
675 int32_t actualBreak = t->bi->preceding(i);
676 int32_t expectedBreak = BreakIterator::DONE;
677
b331163b
A
678 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
679 // preceding(trailing byte) will return the index of some preceding code point,
680 // not the lead byte of the current code point, even though that has a smaller index.
681 // Therefore, start looking at the expected break data not at i-1, but at
682 // the start of code point index - 1.
683 utext_setNativeIndex(t->textToBreak, i);
3d1f044b 684 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
b331163b
A
685 for (; j >= 0; j--) {
686 if (t->getExpectedBreak(j) != 0) {
51004dcb
A
687 expectedBreak = j;
688 break;
689 }
690 }
691 if (expectedBreak != actualBreak) {
692 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
693 " Expected, Actual= %d, %d",
b331163b 694 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
51004dcb
A
695 }
696 }
73c04bcf
A
697}
698
699
700void RBBITest::TestExtended() {
f3c0d7a5
A
701 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
702 // data driven test closely entangles filtered and regular data.
703#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
73c04bcf
A
704 UErrorCode status = U_ZERO_ERROR;
705 Locale locale("");
706
b331163b 707 TestParams tp(status);
73c04bcf 708
0f5d89e8 709 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
729e4ab9
A
710 if (U_FAILURE(status)) {
711 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
712 }
73c04bcf 713
73c04bcf
A
714 //
715 // Open and read the test data file.
716 //
717 const char *testDataDirectory = IntlTest::getSourceTestData(status);
0f5d89e8
A
718 CharString testFileName(testDataDirectory, -1, status);
719 testFileName.append("rbbitst.txt", -1, status);
73c04bcf
A
720
721 int len;
0f5d89e8 722 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
73c04bcf 723 if (U_FAILURE(status)) {
0f5d89e8
A
724 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
725 return;
73c04bcf
A
726 }
727
2ca993e8 728 bool skipTest = false; // Skip this test?
46f4442e 729
73c04bcf
A
730 //
731 // Put the test data into a UnicodeString
732 //
733 UnicodeString testString(FALSE, testFile, len);
734
735 enum EParseState{
736 PARSE_COMMENT,
737 PARSE_TAG,
738 PARSE_DATA,
0f5d89e8
A
739 PARSE_NUM,
740 PARSE_RULES
73c04bcf
A
741 }
742 parseState = PARSE_TAG;
743
744 EParseState savedState = PARSE_TAG;
745
73c04bcf
A
746 int32_t lineNum = 1;
747 int32_t colStart = 0;
748 int32_t column = 0;
749 int32_t charIdx = 0;
750
0f5d89e8
A
751 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
752
753 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
340931cb
A
754 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
755
756 // <rdar://problem/51193810>
757 mach_timebase_info_data_t info;
758 uint64_t start, durationOpen = 0.0, durationUse = 0.0;
759 mach_timebase_info(&info);
760 UBool isLine = FALSE;
73c04bcf
A
761
762 for (charIdx = 0; charIdx < len; ) {
763 status = U_ZERO_ERROR;
764 UChar c = testString.charAt(charIdx);
765 charIdx++;
f3c0d7a5 766 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
73c04bcf 767 // treat CRLF as a unit
f3c0d7a5 768 c = u'\n';
73c04bcf
A
769 charIdx++;
770 }
f3c0d7a5 771 if (c == u'\n' || c == u'\r') {
73c04bcf
A
772 lineNum++;
773 colStart = charIdx;
774 }
775 column = charIdx - colStart + 1;
776
777 switch (parseState) {
778 case PARSE_COMMENT:
f3c0d7a5 779 if (c == u'\n' || c == u'\r') {
73c04bcf
A
780 parseState = savedState;
781 }
782 break;
783
784 case PARSE_TAG:
785 {
f3c0d7a5 786 if (c == u'#') {
73c04bcf
A
787 parseState = PARSE_COMMENT;
788 savedState = PARSE_TAG;
789 break;
790 }
791 if (u_isUWhiteSpace(c)) {
792 break;
793 }
0f5d89e8 794 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
73c04bcf
A
795 delete tp.bi;
796 tp.bi = BreakIterator::createWordInstance(locale, status);
2ca993e8 797 skipTest = false;
73c04bcf 798 charIdx += 5;
340931cb 799 isLine = FALSE;
73c04bcf
A
800 break;
801 }
0f5d89e8 802 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
73c04bcf
A
803 delete tp.bi;
804 tp.bi = BreakIterator::createCharacterInstance(locale, status);
2ca993e8 805 skipTest = false;
73c04bcf 806 charIdx += 5;
340931cb 807 isLine = FALSE;
73c04bcf
A
808 break;
809 }
0f5d89e8 810 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
73c04bcf 811 delete tp.bi;
340931cb 812 start = mach_absolute_time(); // <rdar://problem/51193810>
73c04bcf 813 tp.bi = BreakIterator::createLineInstance(locale, status);
340931cb 814 durationOpen += (((mach_absolute_time() - start) * info.numer)/info.denom);
2ca993e8 815 skipTest = false;
73c04bcf 816 charIdx += 5;
340931cb 817 isLine = TRUE;
73c04bcf
A
818 break;
819 }
0f5d89e8 820 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
73c04bcf 821 delete tp.bi;
46f4442e 822 tp.bi = BreakIterator::createSentenceInstance(locale, status);
2ca993e8 823 skipTest = false;
73c04bcf
A
824 charIdx += 5;
825 break;
826 }
0f5d89e8 827 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
73c04bcf
A
828 delete tp.bi;
829 tp.bi = BreakIterator::createTitleInstance(locale, status);
830 charIdx += 6;
340931cb 831 isLine = FALSE;
73c04bcf
A
832 break;
833 }
46f4442e 834
0f5d89e8
A
835 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
836 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
837 charIdx = testString.indexOf(u'>', charIdx) + 1;
838 parseState = PARSE_RULES;
839 rules.remove();
840 rulesFirstLine = lineNum;
340931cb 841 isLine = FALSE;
0f5d89e8
A
842 break;
843 }
844
73c04bcf
A
845 // <locale loc_name>
846 localeMatcher.reset(testString);
847 if (localeMatcher.lookingAt(charIdx-1, status)) {
848 UnicodeString localeName = localeMatcher.group(1, status);
849 char localeName8[100];
850 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
851 locale = Locale::createFromName(localeName8);
51004dcb 852 charIdx += localeMatcher.group(0, status).length() - 1;
73c04bcf
A
853 TEST_ASSERT_SUCCESS(status);
854 break;
855 }
0f5d89e8 856 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
73c04bcf
A
857 parseState = PARSE_DATA;
858 charIdx += 5;
859 tp.dataToBreak = "";
860 tp.expectedBreaks->removeAllElements();
861 tp.srcCol ->removeAllElements();
862 tp.srcLine->removeAllElements();
863 break;
864 }
865
866 errln("line %d: Tag expected in test file.", lineNum);
73c04bcf
A
867 parseState = PARSE_COMMENT;
868 savedState = PARSE_DATA;
46f4442e 869 goto end_test; // Stop the test.
73c04bcf
A
870 }
871 break;
872
0f5d89e8
A
873 case PARSE_RULES:
874 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
875 charIdx += 7;
876 parseState = PARSE_TAG;
877 delete tp.bi;
878 UParseError pe;
879 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
880 skipTest = U_FAILURE(status);
881 if (U_FAILURE(status)) {
882 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
883 rulesFirstLine + pe.line - 1, u_errorName(status));
884 }
885 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
886 charIdx += 10;
887 parseState = PARSE_TAG;
888 UErrorCode ec = U_ZERO_ERROR;
889 UParseError pe;
890 RuleBasedBreakIterator bi(rules, pe, ec);
891 if (U_SUCCESS(ec)) {
892 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
893 rulesFirstLine + pe.line - 1);
894 }
895 } else {
896 rules.append(c);
897 }
898 break;
899
73c04bcf 900 case PARSE_DATA:
f3c0d7a5 901 if (c == u'\u2022') { // u'•'
73c04bcf
A
902 int32_t breakIdx = tp.dataToBreak.length();
903 tp.expectedBreaks->setSize(breakIdx+1);
904 tp.expectedBreaks->setElementAt(-1, breakIdx);
905 tp.srcLine->setSize(breakIdx+1);
906 tp.srcLine->setElementAt(lineNum, breakIdx);
907 tp.srcCol ->setSize(breakIdx+1);
908 tp.srcCol ->setElementAt(column, breakIdx);
909 break;
910 }
911
0f5d89e8 912 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
73c04bcf
A
913 // Add final entry to mappings from break location to source file position.
914 // Need one extra because last break position returned is after the
915 // last char in the data, not at the last char.
916 tp.srcLine->addElement(lineNum, status);
917 tp.srcCol ->addElement(column, status);
918
919 parseState = PARSE_TAG;
920 charIdx += 6;
921
2ca993e8
A
922 if (!skipTest) {
923 // RUN THE TEST!
924 status = U_ZERO_ERROR;
925 tp.setUTF16(status);
340931cb 926 start = mach_absolute_time(); // <rdar://problem/51193810>
2ca993e8 927 executeTest(&tp, status);
340931cb
A
928 if (isLine) {
929 durationUse += (((mach_absolute_time() - start) * info.numer)/info.denom);
930 }
2ca993e8
A
931 TEST_ASSERT_SUCCESS(status);
932
933 // Run again, this time with UTF-8 text wrapped in a UText.
934 status = U_ZERO_ERROR;
935 tp.setUTF8(status);
936 TEST_ASSERT_SUCCESS(status);
937 executeTest(&tp, status);
938 }
73c04bcf
A
939 break;
940 }
941
0f5d89e8 942 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
73c04bcf
A
943 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
944 // Get the code point from the name and insert it into the test data.
945 // (Damn, no API takes names in Unicode !!!
946 // we've got to take it back to char *)
f3c0d7a5 947 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
73c04bcf
A
948 int32_t nameLength = nameEndIdx - (charIdx+2);
949 char charNameBuf[200];
950 UChar32 theChar = -1;
951 if (nameEndIdx != -1) {
952 UErrorCode status = U_ZERO_ERROR;
953 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
954 charNameBuf[sizeof(charNameBuf)-1] = 0;
955 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
956 if (U_FAILURE(status)) {
957 theChar = -1;
958 }
959 }
960 if (theChar == -1) {
961 errln("Error in named character in test file at line %d, col %d",
962 lineNum, column);
963 } else {
964 // Named code point was recognized. Insert it
965 // into the test data.
966 tp.dataToBreak.append(theChar);
967 while (tp.dataToBreak.length() > tp.srcLine->size()) {
968 tp.srcLine->addElement(lineNum, status);
969 tp.srcCol ->addElement(column, status);
970 }
971 }
972 if (nameEndIdx > charIdx) {
973 charIdx = nameEndIdx+1;
974
975 }
976 break;
977 }
978
979
980
0f5d89e8 981 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
73c04bcf
A
982 charIdx++;
983 int32_t breakIdx = tp.dataToBreak.length();
984 tp.expectedBreaks->setSize(breakIdx+1);
985 tp.expectedBreaks->setElementAt(-1, breakIdx);
986 tp.srcLine->setSize(breakIdx+1);
987 tp.srcLine->setElementAt(lineNum, breakIdx);
988 tp.srcCol ->setSize(breakIdx+1);
989 tp.srcCol ->setElementAt(column, breakIdx);
990 break;
991 }
992
f3c0d7a5 993 if (c == u'<') {
73c04bcf
A
994 tagValue = 0;
995 parseState = PARSE_NUM;
996 break;
997 }
998
f3c0d7a5 999 if (c == u'#' && column==3) { // TODO: why is column off so far?
73c04bcf
A
1000 parseState = PARSE_COMMENT;
1001 savedState = PARSE_DATA;
1002 break;
1003 }
1004
f3c0d7a5 1005 if (c == u'\\') {
73c04bcf
A
1006 // Check for \ at end of line, a line continuation.
1007 // Advance over (discard) the newline
1008 UChar32 cp = testString.char32At(charIdx);
f3c0d7a5 1009 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
73c04bcf
A
1010 // We have a CR LF
1011 // Need an extra increment of the input ptr to move over both of them
1012 charIdx++;
1013 }
f3c0d7a5 1014 if (cp == u'\n' || cp == u'\r') {
73c04bcf
A
1015 lineNum++;
1016 colStart = charIdx;
1017 charIdx++;
1018 break;
1019 }
1020
1021 // Let unescape handle the back slash.
1022 cp = testString.unescapeAt(charIdx);
1023 if (cp != -1) {
1024 // Escape sequence was recognized. Insert the char
1025 // into the test data.
1026 tp.dataToBreak.append(cp);
1027 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1028 tp.srcLine->addElement(lineNum, status);
1029 tp.srcCol ->addElement(column, status);
1030 }
1031 break;
1032 }
1033
1034
1035 // Not a recognized backslash escape sequence.
1036 // Take the next char as a literal.
1037 // TODO: Should this be an error?
1038 c = testString.charAt(charIdx);
1039 charIdx = testString.moveIndex32(charIdx, 1);
1040 }
1041
1042 // Normal, non-escaped data char.
1043 tp.dataToBreak.append(c);
1044
1045 // Save the mapping from offset in the data to line/column numbers in
1046 // the original input file. Will be used for better error messages only.
1047 // If there's an expected break before this char, the slot in the mapping
1048 // vector will already be set for this char; don't overwrite it.
1049 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1050 tp.srcLine->addElement(lineNum, status);
1051 tp.srcCol ->addElement(column, status);
1052 }
1053 break;
1054
1055
1056 case PARSE_NUM:
1057 // We are parsing an expected numeric tag value, like <1234>,
1058 // within a chunk of data.
1059 if (u_isUWhiteSpace(c)) {
1060 break;
1061 }
1062
f3c0d7a5 1063 if (c == u'>') {
73c04bcf
A
1064 // Finished the number. Add the info to the expected break data,
1065 // and switch parse state back to doing plain data.
1066 parseState = PARSE_DATA;
1067 if (tagValue == 0) {
1068 tagValue = -1;
1069 }
1070 int32_t breakIdx = tp.dataToBreak.length();
1071 tp.expectedBreaks->setSize(breakIdx+1);
1072 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1073 tp.srcLine->setSize(breakIdx+1);
1074 tp.srcLine->setElementAt(lineNum, breakIdx);
1075 tp.srcCol ->setSize(breakIdx+1);
1076 tp.srcCol ->setElementAt(column, breakIdx);
1077 break;
1078 }
1079
1080 if (u_isdigit(c)) {
1081 tagValue = tagValue*10 + u_charDigitValue(c);
1082 break;
1083 }
1084
1085 errln("Syntax Error in test file at line %d, col %d",
1086 lineNum, column);
73c04bcf 1087 parseState = PARSE_COMMENT;
46f4442e 1088 goto end_test; // Stop the test
73c04bcf
A
1089 break;
1090 }
1091
1092
1093 if (U_FAILURE(status)) {
4388f060 1094 dataerrln("ICU Error %s while parsing test file at line %d.",
73c04bcf 1095 u_errorName(status), lineNum);
73c04bcf 1096 status = U_ZERO_ERROR;
46f4442e 1097 goto end_test; // Stop the test
73c04bcf
A
1098 }
1099
1100 }
1101
0f5d89e8
A
1102 // Reached end of test file. Raise an error if parseState indicates that we are
1103 // within a block that should have been terminated.
1104
1105 if (parseState == PARSE_RULES) {
1106 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1107 lineNum, rulesFirstLine);
1108 }
1109 if (parseState == PARSE_DATA) {
1110 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1111 }
1112
340931cb
A
1113 //
1114 infoln("TestExtended total time in createLineInstance (nsec):\t%llu\n", durationOpen);
1115 infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse);
1116
0f5d89e8 1117
73c04bcf 1118end_test:
73c04bcf
A
1119 delete [] testFile;
1120#endif
1121}
1122
729e4ab9
A
1123
//-------------------------------------------------------------------------------
//
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  and the dictionary code, without a type, would loop.
//
//-------------------------------------------------------------------------------
1132void RBBITest::TestDictRules() {
1133 const char *rules = "$dictionary = [a-z]; \n"
1134 "!!forward; \n"
1135 "$dictionary $dictionary; \n"
1136 "!!reverse; \n"
1137 "$dictionary $dictionary; \n";
1138 const char *text = "aa";
1139 UErrorCode status = U_ZERO_ERROR;
1140 UParseError parseError;
1141
1142 RuleBasedBreakIterator bi(rules, parseError, status);
1143 if (U_SUCCESS(status)) {
1144 UnicodeString utext = text;
1145 bi.setText(utext);
1146 int32_t position;
1147 int32_t loops;
1148 for (loops = 0; loops<10; loops++) {
1149 position = bi.next();
1150 if (position == RuleBasedBreakIterator::DONE) {
1151 break;
1152 }
1153 }
1154 TEST_ASSERT(loops == 1);
1155 } else {
1156 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1157 }
1158}
1159
1160
73c04bcf
A
1161
1162//-------------------------------------------------------------------------------
1163//
1164// ReadAndConvertFile Read a text data file, convert it to UChars, and
2ca993e8 1165// return the data in one big UChar * buffer, which the caller must delete.
73c04bcf 1166//
46f4442e
A
1167// parameters:
1168// fileName: the name of the file, with no directory part. The test data directory
1169// is assumed.
1170// ulen an out parameter, receives the actual length (in UChars) of the file data.
1171// encoding The file encoding. If the file contains a BOM, that will override the encoding
1172// specified here. The BOM, if it exists, will be stripped from the returned data.
1173// Pass NULL for the system default encoding.
1174// status
1175// returns:
1176// The file data, converted to UChar.
1177// The caller must delete this when done with
1178// delete [] theBuffer;
1179//
73c04bcf
A
1180// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1181// Move this function to some common place.
1182//
1183//--------------------------------------------------------------------------------
46f4442e 1184UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
73c04bcf
A
1185 UChar *retPtr = NULL;
1186 char *fileBuf = NULL;
1187 UConverter* conv = NULL;
1188 FILE *f = NULL;
1189
1190 ulen = 0;
1191 if (U_FAILURE(status)) {
1192 return retPtr;
1193 }
1194
1195 //
1196 // Open the file.
1197 //
1198 f = fopen(fileName, "rb");
1199 if (f == 0) {
729e4ab9 1200 dataerrln("Error opening test data file %s\n", fileName);
73c04bcf
A
1201 status = U_FILE_ACCESS_ERROR;
1202 return NULL;
1203 }
1204 //
1205 // Read it in
1206 //
1207 int fileSize;
1208 int amt_read;
1209
1210 fseek( f, 0, SEEK_END);
1211 fileSize = ftell(f);
1212 fileBuf = new char[fileSize];
1213 fseek(f, 0, SEEK_SET);
3d1f044b 1214 amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
73c04bcf
A
1215 if (amt_read != fileSize || fileSize <= 0) {
1216 errln("Error reading test data file.");
1217 goto cleanUpAndReturn;
1218 }
1219
1220 //
1221 // Look for a Unicode Signature (BOM) on the data just read
1222 //
1223 int32_t signatureLength;
1224 const char * fileBufC;
46f4442e 1225 const char* bomEncoding;
73c04bcf
A
1226
1227 fileBufC = fileBuf;
46f4442e 1228 bomEncoding = ucnv_detectUnicodeSignature(
73c04bcf 1229 fileBuf, fileSize, &signatureLength, &status);
46f4442e 1230 if(bomEncoding!=NULL ){
73c04bcf
A
1231 fileBufC += signatureLength;
1232 fileSize -= signatureLength;
46f4442e 1233 encoding = bomEncoding;
73c04bcf
A
1234 }
1235
1236 //
1237 // Open a converter to take the rule file to UTF-16
1238 //
1239 conv = ucnv_open(encoding, &status);
1240 if (U_FAILURE(status)) {
1241 goto cleanUpAndReturn;
1242 }
1243
1244 //
1245 // Convert the rules to UChar.
1246 // Preflight first to determine required buffer size.
1247 //
1248 ulen = ucnv_toUChars(conv,
1249 NULL, // dest,
1250 0, // destCapacity,
1251 fileBufC,
1252 fileSize,
1253 &status);
1254 if (status == U_BUFFER_OVERFLOW_ERROR) {
1255 // Buffer Overflow is expected from the preflight operation.
1256 status = U_ZERO_ERROR;
1257
1258 retPtr = new UChar[ulen+1];
1259 ucnv_toUChars(conv,
1260 retPtr, // dest,
1261 ulen+1,
1262 fileBufC,
1263 fileSize,
1264 &status);
1265 }
1266
1267cleanUpAndReturn:
1268 fclose(f);
1269 delete []fileBuf;
1270 ucnv_close(conv);
1271 if (U_FAILURE(status)) {
1272 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 1273 delete []retPtr;
73c04bcf
A
1274 retPtr = 0;
1275 ulen = 0;
340931cb 1276 }
73c04bcf
A
1277 return retPtr;
1278}
1279
1280
73c04bcf 1281
46f4442e 1282//--------------------------------------------------------------------------------------------
73c04bcf 1283//
46f4442e 1284// Run tests from each of the boundary test data files distributed by the Unicode Consortium
73c04bcf 1285//
46f4442e
A
1286//-------------------------------------------------------------------------------------------
1287void RBBITest::TestUnicodeFiles() {
1288 RuleBasedBreakIterator *bi;
1289 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1290
729e4ab9 1291 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
46f4442e
A
1292 TEST_ASSERT_SUCCESS(status);
1293 if (U_SUCCESS(status)) {
1294 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1295 }
1296 delete bi;
73c04bcf 1297
729e4ab9 1298 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
46f4442e
A
1299 TEST_ASSERT_SUCCESS(status);
1300 if (U_SUCCESS(status)) {
1301 runUnicodeTestData("WordBreakTest.txt", bi);
1302 }
1303 delete bi;
73c04bcf 1304
729e4ab9 1305 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
46f4442e
A
1306 TEST_ASSERT_SUCCESS(status);
1307 if (U_SUCCESS(status)) {
1308 runUnicodeTestData("SentenceBreakTest.txt", bi);
1309 }
1310 delete bi;
73c04bcf 1311
729e4ab9 1312 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
46f4442e
A
1313 TEST_ASSERT_SUCCESS(status);
1314 if (U_SUCCESS(status)) {
1315 runUnicodeTestData("LineBreakTest.txt", bi);
73c04bcf 1316 }
46f4442e 1317 delete bi;
73c04bcf
A
1318}
1319
1320
b331163b 1321// Check for test cases from the Unicode test data files that are known to fail
3d1f044b
A
1322// and should be skipped as known issues because ICU does not fully implement
1323// the Unicode specifications, or because ICU includes tailorings that differ from
1324// the Unicode standard.
1325//
1326// Test cases are identified by the test data sequence, which tends to be more stable
1327// across Unicode versions than the test file line numbers.
1328//
1329// The test case with ticket "10666" is a dummy, included as an example.
b331163b
A
1330
1331UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
f3c0d7a5 1332 static struct TestCase {
3d1f044b 1333 const char *fTicketNum;
f3c0d7a5
A
1334 const char *fFileName;
1335 const UChar *fString;
3d1f044b
A
1336 } badTestCases[] = {
1337 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1338 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1339 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1340 // ICU is out of sync with Unicode.
1341 {"8151", "LineBreakTest.txt", u"-#"},
1342 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1343 {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
1344 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1345 {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
1346 {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1347 {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
1348 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1349
1350 // Issue ICU-12017 Improve line break around numbers
1351 {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1352 {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1353 {"12017", "LineBreakTest.txt", u"find .com"},
1354 {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1355 {"12017", "LineBreakTest.txt", u"a.2 "},
1356 {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1357 {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1358 {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1359 {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1360 {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1361 {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1362 {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1363 {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1364 {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1365 {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
b331163b 1366 };
b331163b 1367
f3c0d7a5
A
1368 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1369 const TestCase &badCase = badTestCases[n];
1370 if (!strcmp(fileName, badCase.fFileName) &&
1371 testCase == UnicodeString(badCase.fString)) {
3d1f044b 1372 return logKnownIssue(badCase.fTicketNum);
b331163b
A
1373 }
1374 }
1375 return FALSE;
1376}
1377
1378
46f4442e
A
1379//--------------------------------------------------------------------------------------------
1380//
1381// Run tests from one of the boundary test data files distributed by the Unicode Consortium
1382//
1383//-------------------------------------------------------------------------------------------
1384void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1385#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1386 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1387
46f4442e
A
1388 //
1389 // Open and read the test data file, put it into a UnicodeString.
1390 //
1391 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1392 char testFileName[1000];
1393 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
729e4ab9 1394 dataerrln("Can't open test data. Path too long.");
73c04bcf
A
1395 return;
1396 }
46f4442e
A
1397 strcpy(testFileName, testDataDirectory);
1398 strcat(testFileName, fileName);
2ca993e8 1399
46f4442e 1400 logln("Opening data file %s\n", fileName);
73c04bcf 1401
46f4442e
A
1402 int len;
1403 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1404 if (status != U_FILE_ACCESS_ERROR) {
1405 TEST_ASSERT_SUCCESS(status);
1406 TEST_ASSERT(testFile != NULL);
1407 }
1408 if (U_FAILURE(status) || testFile == NULL) {
1409 return; /* something went wrong, error already output */
1410 }
1411 UnicodeString testFileAsString(TRUE, testFile, len);
73c04bcf 1412
46f4442e
A
1413 //
1414 // Parse the test data file using a regular expression.
1415 // Each kind of token is recognized in its own capture group; what type of item was scanned
1416 // is identified by which group had a match.
1417 //
1418 // Caputure Group # 1 2 3 4 5
1419 // Parses this item: divide x hex digits comment \n unrecognized \n
1420 //
1421 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1422 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1423 UnicodeString testString;
1424 UVector32 breakPositions(status);
1425 int lineNumber = 1;
1426 TEST_ASSERT_SUCCESS(status);
1427 if (U_FAILURE(status)) {
73c04bcf
A
1428 return;
1429 }
1430
46f4442e
A
1431 //
1432 // Scan through each test case, building up the string to be broken in testString,
1433 // and the positions that should be boundaries in the breakPositions vector.
1434 //
729e4ab9 1435 int spin = 0;
46f4442e 1436 while (tokenMatcher.find()) {
340931cb 1437 if(tokenMatcher.hitEnd()) {
729e4ab9
A
1438 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1439 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1440 and caused an infinite loop here on EBCDIC systems!
1441 */
1442 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
340931cb
A
1443 // return;
1444 }
46f4442e
A
1445 if (tokenMatcher.start(1, status) >= 0) {
1446 // Scanned a divide sign, indicating a break position in the test data.
1447 if (testString.length()>0) {
1448 breakPositions.addElement(testString.length(), status);
73c04bcf 1449 }
46f4442e
A
1450 }
1451 else if (tokenMatcher.start(2, status) >= 0) {
1452 // Scanned an 'x', meaning no break at this position in the test data
1453 // Nothing to be done here.
1454 }
1455 else if (tokenMatcher.start(3, status) >= 0) {
1456 // Scanned Hex digits. Convert them to binary, append to the character data string.
1457 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1458 int length = hexNumber.length();
1459 if (length<=8) {
1460 char buf[10];
1461 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1462 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1463 if (c<=0x10ffff) {
1464 testString.append(c);
1465 } else {
1466 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1467 fileName, lineNumber);
1468 }
1469 } else {
1470 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1471 fileName, lineNumber);
1472 }
1473 }
1474 else if (tokenMatcher.start(4, status) >= 0) {
1475 // Scanned to end of a line, possibly skipping over a comment in the process.
1476 // If the line from the file contained test data, run the test now.
2ca993e8 1477 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
46f4442e 1478 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
73c04bcf
A
1479 }
1480
46f4442e
A
1481 // Clear out this test case.
1482 // The string and breakPositions vector will be refilled as the next
1483 // test case is parsed.
1484 testString.remove();
1485 breakPositions.removeAllElements();
1486 lineNumber++;
1487 } else {
1488 // Scanner catchall. Something unrecognized appeared on the line.
1489 char token[16];
1490 UnicodeString uToken = tokenMatcher.group(0, status);
1491 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1492 token[sizeof(token)-1] = 0;
1493 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1494
1495 // Clean up, in preparation for continuing with the next line.
1496 testString.remove();
1497 breakPositions.removeAllElements();
1498 lineNumber++;
1499 }
1500 TEST_ASSERT_SUCCESS(status);
1501 if (U_FAILURE(status)) {
73c04bcf
A
1502 break;
1503 }
46f4442e 1504 }
73c04bcf 1505
46f4442e
A
1506 delete [] testFile;
1507 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1508}
73c04bcf 1509
46f4442e
A
1510//--------------------------------------------------------------------------------------------
1511//
1512// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1513// test data files. Do only a simple, forward-only check -
1514// this test is mostly to check that ICU and the Unicode
1515// data agree with each other.
1516//
1517//--------------------------------------------------------------------------------------------
1518void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1519 const UnicodeString &testString, // Text data to be broken
1520 UVector32 *breakPositions, // Positions where breaks should be found.
1521 RuleBasedBreakIterator *bi) {
1522 int32_t pos; // Break Position in the test string
1523 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1524 int32_t expectedPos; // Expected break position (index into test string)
1525
1526 bi->setText(testString);
1527 pos = bi->first();
1528 pos = bi->next();
1529
1530 while (pos != BreakIterator::DONE) {
1531 if (expectedI >= breakPositions->size()) {
1532 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1533 testFileName, lineNumber, pos);
1534 break;
73c04bcf 1535 }
46f4442e
A
1536 expectedPos = breakPositions->elementAti(expectedI);
1537 if (pos < expectedPos) {
1538 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1539 testFileName, lineNumber, pos);
1540 break;
1541 }
1542 if (pos > expectedPos) {
1543 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1544 testFileName, lineNumber, expectedPos);
73c04bcf
A
1545 break;
1546 }
46f4442e
A
1547 pos = bi->next();
1548 expectedI++;
1549 }
73c04bcf 1550
46f4442e
A
1551 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1552 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1553 testFileName, lineNumber, breakPositions->elementAti(expectedI));
73c04bcf 1554 }
46f4442e 1555}
73c04bcf 1556
73c04bcf 1557
73c04bcf
A
1558
1559#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
1572class RBBIMonkeyKind {
1573public:
1574 // Return a UVector of UnicodeSets, representing the character classes used
1575 // for this type of iterator.
1576 virtual UVector *charClasses() = 0;
1577
1578 // Set the test text on which subsequent calls to next() will operate
1579 virtual void setText(const UnicodeString &s) = 0;
1580
1581 // Find the next break postion, starting from the prev break position, or from zero.
1582 // Return -1 after reaching end of string.
1583 virtual int32_t next(int32_t i) = 0;
1584
340931cb
A
1585 // Name of each character class, parallel with charClasses. Used for debugging output
1586 // of characters.
1587 virtual std::vector<std::string>& characterClassNames();
1588
1589 void setAppliedRule(int32_t position, const char* value);
1590
1591 std::string getAppliedRule(int32_t position);
1592
73c04bcf 1593 virtual ~RBBIMonkeyKind();
340931cb 1594 UErrorCode deferredStatus;
73c04bcf 1595
340931cb
A
1596 std::string classNameFromCodepoint(const UChar32 c);
1597 unsigned int maxClassNameSize();
73c04bcf 1598
340931cb
A
1599 protected:
1600 RBBIMonkeyKind();
1601 std::vector<std::string> classNames;
1602 std::vector<std::string> appliedRules;
1603
1604 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1605 void prepareAppliedRules(int32_t size );
1606
1607 private:
73c04bcf 1608
73c04bcf
A
1609};
1610
1611RBBIMonkeyKind::RBBIMonkeyKind() {
1612 deferredStatus = U_ZERO_ERROR;
1613}
1614
1615RBBIMonkeyKind::~RBBIMonkeyKind() {
1616}
1617
340931cb
A
1618std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1619 return classNames;
1620}
1621
1622void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1623 // Remove all the information in the `appliedRules`.
1624 appliedRules.clear();
1625 appliedRules.resize(size + 1);
1626}
1627
1628void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1629 appliedRules[position] = value;
1630}
1631
1632std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1633 return appliedRules[position];
1634}
1635
1636std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1637 // Simply iterate through charClasses to find character's class
1638 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1639 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1640 if (classSet->contains(c)) {
1641 return classNames[aClassNum];
1642 }
1643 }
1644 U_ASSERT(FALSE); // This should not happen.
1645 return "bad class name";
1646}
1647
1648unsigned int RBBIMonkeyKind::maxClassNameSize() {
1649 unsigned int maxSize = 0;
1650 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1651 if (classNames[aClassNum].size() > maxSize) {
1652 maxSize = classNames[aClassNum].size();
1653 }
1654 }
1655 return maxSize;
1656}
73c04bcf
A
1657
1658//----------------------------------------------------------------------------------------
1659//
1660// Random Numbers. Similar to standard lib rand() and srand()
1661// Not using library to
1662// 1. Get same results on all platforms.
1663// 2. Get access to current seed, to more easily reproduce failures.
1664//
1665//---------------------------------------------------------------------------------------
// Current generator state; may be set directly to reproduce a sequence.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
// Arithmetic is unsigned 32 bit, so the state wraps modulo 2^32.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    // Bits 16..30 of the state: same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
1673
1674
1675//------------------------------------------------------------------------------------------
1676//
1677// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1678// of RBBIMonkeyKind.
1679//
1680//------------------------------------------------------------------------------------------
1681class RBBICharMonkey: public RBBIMonkeyKind {
1682public:
1683 RBBICharMonkey();
1684 virtual ~RBBICharMonkey();
1685 virtual UVector *charClasses();
1686 virtual void setText(const UnicodeString &s);
1687 virtual int32_t next(int32_t i);
1688private:
1689 UVector *fSets;
1690
1691 UnicodeSet *fCRLFSet;
1692 UnicodeSet *fControlSet;
1693 UnicodeSet *fExtendSet;
f3c0d7a5 1694 UnicodeSet *fZWJSet;
51004dcb 1695 UnicodeSet *fRegionalIndicatorSet;
46f4442e
A
1696 UnicodeSet *fPrependSet;
1697 UnicodeSet *fSpacingSet;
1698 UnicodeSet *fLSet;
1699 UnicodeSet *fVSet;
1700 UnicodeSet *fTSet;
1701 UnicodeSet *fLVSet;
1702 UnicodeSet *fLVTSet;
73c04bcf 1703 UnicodeSet *fHangulSet;
f3c0d7a5 1704 UnicodeSet *fExtendedPictSet;
340931cb
A
1705 UnicodeSet *fViramaSet;
1706 UnicodeSet *fLinkingConsonantSet;
1707 UnicodeSet *fExtCccZwjSet;
f3c0d7a5 1708 UnicodeSet *fAnySet;
73c04bcf 1709
73c04bcf
A
1710 const UnicodeString *fText;
1711};
1712
1713
1714RBBICharMonkey::RBBICharMonkey() {
1715 UErrorCode status = U_ZERO_ERROR;
1716
1717 fText = NULL;
73c04bcf 1718
46f4442e 1719 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
f3c0d7a5
A
1720 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1721 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1722 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1723 fRegionalIndicatorSet =
1724 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
46f4442e
A
1725 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1726 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1727 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1728 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1729 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1730 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1731 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1732 fHangulSet = new UnicodeSet();
1733 fHangulSet->addAll(*fLSet);
1734 fHangulSet->addAll(*fVSet);
1735 fHangulSet->addAll(*fTSet);
1736 fHangulSet->addAll(*fLVSet);
1737 fHangulSet->addAll(*fLVTSet);
2ca993e8 1738
0f5d89e8 1739 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
340931cb
A
1740 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1741 "\\p{Indic_Syllabic_Category=Virama}]", status);
1742 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1743 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1744 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
f3c0d7a5 1745 fAnySet = new UnicodeSet(0, 0x10ffff);
2ca993e8 1746
340931cb
A
1747 // Create sets of characters, and add the names of the above character sets.
1748 // In each new ICU release, add new names corresponding to the sets above.
f3c0d7a5 1749 fSets = new UVector(status);
340931cb
A
1750
1751 // Important: Keep class names the same as the class contents.
1752 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1753 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1754 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1755 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
4388f060 1756 if (!fPrependSet->isEmpty()) {
340931cb 1757 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
4388f060 1758 }
340931cb
A
1759 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1760 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1761 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1762 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1763 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1764 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1765 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1766 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1767
73c04bcf
A
1768 if (U_FAILURE(status)) {
1769 deferredStatus = status;
1770 }
1771}
1772
1773
1774void RBBICharMonkey::setText(const UnicodeString &s) {
1775 fText = &s;
340931cb 1776 prepareAppliedRules(s.length());
73c04bcf
A
1777}
1778
1779
73c04bcf 1780
46f4442e
A
1781int32_t RBBICharMonkey::next(int32_t prevPos) {
1782 int p0, p1, p2, p3; // Indices of the significant code points around the
1783 // break position being tested. The candidate break
1784 // location is before p2.
1785
1786 int breakPos = -1;
1787
1788 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2ca993e8
A
1789 UChar32 cBase; // for (X Extend*) patterns, the X character.
1790
46f4442e
A
1791 if (U_FAILURE(deferredStatus)) {
1792 return -1;
73c04bcf 1793 }
46f4442e
A
1794
1795 // Previous break at end of string. return DONE.
1796 if (prevPos >= fText->length()) {
1797 return -1;
73c04bcf 1798 }
340931cb 1799
46f4442e
A
1800 p0 = p1 = p2 = p3 = prevPos;
1801 c3 = fText->char32At(prevPos);
2ca993e8 1802 c0 = c1 = c2 = cBase = 0;
57a6839d
A
1803 (void)p0; // suppress set but not used warning.
1804 (void)c0;
46f4442e
A
1805
1806 // Loop runs once per "significant" character position in the input text.
1807 for (;;) {
1808 // Move all of the positions forward in the input string.
1809 p0 = p1; c0 = c1;
1810 p1 = p2; c1 = c2;
1811 p2 = p3; c2 = c3;
1812
340931cb 1813 // Advance p3 by one codepoint
46f4442e
A
1814 p3 = fText->moveIndex32(p3, 1);
1815 c3 = fText->char32At(p3);
1816
1817 if (p1 == p2) {
1818 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1819 continue;
1820 }
340931cb 1821
46f4442e 1822 if (p2 == fText->length()) {
340931cb 1823 setAppliedRule(p2, "End of String");
46f4442e
A
1824 break;
1825 }
1826
46f4442e
A
1827 // No Extend or Format characters may appear between the CR and LF,
1828 // which requires the additional check for p2 immediately following p1.
1829 //
1830 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
340931cb
A
1831 setAppliedRule(p2, "GB3 CR x LF");
1832 continue;
46f4442e
A
1833 }
1834
46f4442e
A
1835 if (fControlSet->contains(c1) ||
1836 c1 == 0x0D ||
1837 c1 == 0x0A) {
340931cb
A
1838 setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
1839 break;
46f4442e
A
1840 }
1841
46f4442e
A
1842 if (fControlSet->contains(c2) ||
1843 c2 == 0x0D ||
1844 c2 == 0x0A) {
340931cb 1845 setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
46f4442e
A
1846 break;
1847 }
1848
46f4442e
A
1849 if (fLSet->contains(c1) &&
1850 (fLSet->contains(c2) ||
1851 fVSet->contains(c2) ||
1852 fLVSet->contains(c2) ||
1853 fLVTSet->contains(c2))) {
340931cb 1854 setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
46f4442e
A
1855 continue;
1856 }
1857
46f4442e
A
1858 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1859 (fVSet->contains(c2) || fTSet->contains(c2))) {
340931cb 1860 setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
46f4442e
A
1861 continue;
1862 }
1863
46f4442e
A
1864 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1865 fTSet->contains(c2)) {
340931cb 1866 setAppliedRule(p2, "GB8 ( LVT | T) x T");
46f4442e
A
1867 continue;
1868 }
1869
2ca993e8
A
1870 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1871 if (!fExtendSet->contains(c1)) {
1872 cBase = c1;
1873 }
340931cb 1874 setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
46f4442e
A
1875 continue;
1876 }
1877
46f4442e 1878 if (fSpacingSet->contains(c2)) {
340931cb 1879 setAppliedRule(p2, "GB9a x SpacingMark");
46f4442e
A
1880 continue;
1881 }
1882
46f4442e 1883 if (fPrependSet->contains(c1)) {
340931cb 1884 setAppliedRule(p2, "GB9b Prepend x");
46f4442e
A
1885 continue;
1886 }
1887
340931cb
A
1888 // Note: Viramas are also included in the ExtCccZwj class.
1889 if (fLinkingConsonantSet->contains(c2)) {
1890 int pi = p1;
1891 bool sawVirama = false;
1892 while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1893 if (fViramaSet->contains(fText->char32At(pi))) {
1894 sawVirama = true;
1895 }
1896 pi = fText->moveIndex32(pi, -1);
1897 }
1898 if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1899 setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1900 continue;
1901 }
1902 }
1903
0f5d89e8 1904 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
340931cb
A
1905 setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1906 continue;
2ca993e8
A
1907 }
1908
2ca993e8
A
1909 // Note: The first if condition is a little tricky. We only need to force
1910 // a break if there are three or more contiguous RIs. If there are
1911 // only two, a break following will occur via other rules, and will include
1912 // any trailing extend characters, which is needed behavior.
0f5d89e8 1913 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2ca993e8 1914 && fRegionalIndicatorSet->contains(c2)) {
340931cb
A
1915 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1916 break;
2ca993e8
A
1917 }
1918 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
340931cb
A
1919 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1920 continue;
2ca993e8
A
1921 }
1922
340931cb 1923 setAppliedRule(p2, "GB999 Any <break> Any");
46f4442e
A
1924 break;
1925 }
1926
1927 breakPos = p2;
1928 return breakPos;
73c04bcf
A
1929}
1930
1931
46f4442e 1932
73c04bcf
A
1933UVector *RBBICharMonkey::charClasses() {
1934 return fSets;
1935}
1936
73c04bcf
A
1937RBBICharMonkey::~RBBICharMonkey() {
1938 delete fSets;
1939 delete fCRLFSet;
1940 delete fControlSet;
1941 delete fExtendSet;
51004dcb 1942 delete fRegionalIndicatorSet;
46f4442e
A
1943 delete fPrependSet;
1944 delete fSpacingSet;
1945 delete fLSet;
1946 delete fVSet;
1947 delete fTSet;
1948 delete fLVSet;
1949 delete fLVTSet;
73c04bcf
A
1950 delete fHangulSet;
1951 delete fAnySet;
2ca993e8 1952 delete fZWJSet;
f3c0d7a5 1953 delete fExtendedPictSet;
340931cb
A
1954 delete fViramaSet;
1955 delete fLinkingConsonantSet;
1956 delete fExtCccZwjSet;
73c04bcf
A
1957}
1958
1959//------------------------------------------------------------------------------------------
1960//
1961// class RBBIWordMonkey Word Break specific implementation
1962// of RBBIMonkeyKind.
1963//
1964//------------------------------------------------------------------------------------------
1965class RBBIWordMonkey: public RBBIMonkeyKind {
1966public:
1967 RBBIWordMonkey();
1968 virtual ~RBBIWordMonkey();
1969 virtual UVector *charClasses();
1970 virtual void setText(const UnicodeString &s);
1971 virtual int32_t next(int32_t i);
1972private:
1973 UVector *fSets;
1974
46f4442e
A
1975 UnicodeSet *fCRSet;
1976 UnicodeSet *fLFSet;
1977 UnicodeSet *fNewlineSet;
57a6839d 1978 UnicodeSet *fRegionalIndicatorSet;
73c04bcf 1979 UnicodeSet *fKatakanaSet;
57a6839d 1980 UnicodeSet *fHebrew_LetterSet;
73c04bcf 1981 UnicodeSet *fALetterSet;
57a6839d
A
1982 UnicodeSet *fSingle_QuoteSet;
1983 UnicodeSet *fDouble_QuoteSet;
46f4442e 1984 UnicodeSet *fMidNumLetSet;
73c04bcf
A
1985 UnicodeSet *fMidLetterSet;
1986 UnicodeSet *fMidNumSet;
1987 UnicodeSet *fNumericSet;
1988 UnicodeSet *fFormatSet;
1989 UnicodeSet *fOtherSet;
1990 UnicodeSet *fExtendSet;
1991 UnicodeSet *fExtendNumLetSet;
0f5d89e8 1992 UnicodeSet *fWSegSpaceSet;
f3c0d7a5 1993 UnicodeSet *fDictionarySet;
f3c0d7a5
A
1994 UnicodeSet *fZWJSet;
1995 UnicodeSet *fExtendedPictSet;
73c04bcf 1996
73c04bcf
A
1997 const UnicodeString *fText;
1998};
1999
2000
46f4442e 2001RBBIWordMonkey::RBBIWordMonkey()
73c04bcf
A
2002{
2003 UErrorCode status = U_ZERO_ERROR;
2004
73c04bcf
A
2005 fSets = new UVector(status);
2006
f3c0d7a5
A
2007 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
2008 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
2009 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
2010 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
2011 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2012 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2013 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2014 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
2015 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
2016 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
2017 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
2018 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
340931cb 2019 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
f3c0d7a5
A
2020 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
2021 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
340931cb
A
2022 // There are some sc=Hani characters with WB=Extend.
2023 // The break rules need to pick one or the other because
2024 // Extend overlapping with something else is messy.
2025 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2026 // in $Han (for $dictionary) and out of $Extend.
2027 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
0f5d89e8 2028 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
f3c0d7a5 2029
f3c0d7a5 2030 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
0f5d89e8 2031 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
f3c0d7a5
A
2032
2033 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2034 fDictionarySet->addAll(*fKatakanaSet);
2035 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2036
2037 fALetterSet->removeAll(*fDictionarySet);
2ca993e8 2038
73c04bcf
A
2039 fOtherSet = new UnicodeSet();
2040 if(U_FAILURE(status)) {
f3c0d7a5
A
2041 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2042 deferredStatus = status;
2043 return;
73c04bcf
A
2044 }
2045
2046 fOtherSet->complement();
46f4442e
A
2047 fOtherSet->removeAll(*fCRSet);
2048 fOtherSet->removeAll(*fLFSet);
2049 fOtherSet->removeAll(*fNewlineSet);
73c04bcf 2050 fOtherSet->removeAll(*fKatakanaSet);
57a6839d 2051 fOtherSet->removeAll(*fHebrew_LetterSet);
73c04bcf 2052 fOtherSet->removeAll(*fALetterSet);
57a6839d
A
2053 fOtherSet->removeAll(*fSingle_QuoteSet);
2054 fOtherSet->removeAll(*fDouble_QuoteSet);
73c04bcf
A
2055 fOtherSet->removeAll(*fMidLetterSet);
2056 fOtherSet->removeAll(*fMidNumSet);
2057 fOtherSet->removeAll(*fNumericSet);
2058 fOtherSet->removeAll(*fExtendNumLetSet);
0f5d89e8 2059 fOtherSet->removeAll(*fWSegSpaceSet);
73c04bcf
A
2060 fOtherSet->removeAll(*fFormatSet);
2061 fOtherSet->removeAll(*fExtendSet);
51004dcb 2062 fOtherSet->removeAll(*fRegionalIndicatorSet);
f3c0d7a5
A
2063 fOtherSet->removeAll(*fZWJSet);
2064 fOtherSet->removeAll(*fExtendedPictSet);
f3c0d7a5 2065
46f4442e 2066 // Inhibit dictionary characters from being tested at all.
f3c0d7a5 2067 fOtherSet->removeAll(*fDictionarySet);
73c04bcf 2068
340931cb
A
2069 // Add classes and their names
2070 fSets->addElement(fCRSet, status); classNames.push_back("CR");
2071 fSets->addElement(fLFSet, status); classNames.push_back("LF");
2072 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2073 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2074 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2075 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2076 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2077 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2078 // Omit Katakana from fSets, which omits Katakana characters
2079 // from the test data. They are all in the dictionary set,
2080 // which this (old, to be retired) monkey test cannot handle.
2081 //fSets->addElement(fKatakanaSet, status);
2082
2083 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2084 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2085 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2086 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2087 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2088 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2089 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2090 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2091 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2092
2093 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2094 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2ca993e8 2095
73c04bcf
A
2096 if (U_FAILURE(status)) {
2097 deferredStatus = status;
2098 }
2099}
2100
2101void RBBIWordMonkey::setText(const UnicodeString &s) {
2102 fText = &s;
340931cb 2103 prepareAppliedRules(s.length());
73c04bcf
A
2104}
2105
2106
2107int32_t RBBIWordMonkey::next(int32_t prevPos) {
2108 int p0, p1, p2, p3; // Indices of the significant code points around the
2109 // break position being tested. The candidate break
2110 // location is before p2.
2111
2112 int breakPos = -1;
2113
2114 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2ca993e8 2115
46f4442e
A
2116 if (U_FAILURE(deferredStatus)) {
2117 return -1;
2118 }
73c04bcf
A
2119
2120 // Prev break at end of string. return DONE.
2121 if (prevPos >= fText->length()) {
2122 return -1;
2123 }
2124 p0 = p1 = p2 = p3 = prevPos;
2125 c3 = fText->char32At(prevPos);
2126 c0 = c1 = c2 = 0;
57a6839d 2127 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2128
2129 // Loop runs once per "significant" character position in the input text.
2130 for (;;) {
2131 // Move all of the positions forward in the input string.
2132 p0 = p1; c0 = c1;
2133 p1 = p2; c1 = c2;
2134 p2 = p3; c2 = c3;
2135
340931cb 2136 // Advance p3 by X(Extend | Format)* Rule 4
46f4442e 2137 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
73c04bcf
A
2138 do {
2139 p3 = fText->moveIndex32(p3, 1);
2140 c3 = fText->char32At(p3);
46f4442e
A
2141 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2142 break;
340931cb 2143 }
73c04bcf 2144 }
f3c0d7a5 2145 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
73c04bcf
A
2146
2147
2148 if (p1 == p2) {
2149 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2150 continue;
2151 }
340931cb 2152
73c04bcf
A
2153 if (p2 == fText->length()) {
2154 // Reached end of string. Always a break position.
2155 break;
2156 }
46f4442e 2157
73c04bcf
A
2158 // No Extend or Format characters may appear between the CR and LF,
2159 // which requires the additional check for p2 immediately following p1.
2160 //
46f4442e 2161 if (c1==0x0D && c2==0x0A) {
340931cb
A
2162 setAppliedRule(p2, "WB3 CR x LF");
2163 continue;
73c04bcf 2164 }
2ca993e8 2165
46f4442e 2166 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
340931cb 2167 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
46f4442e 2168 break;
340931cb 2169 }
46f4442e 2170 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
340931cb 2171 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
46f4442e 2172 break;
340931cb 2173 }
73c04bcf 2174
2ca993e8
A
2175 // Not ignoring extend chars, so peek into input text to
2176 // get the potential ZWJ, the character immediately preceding c2.
2177 // Sloppy UChar32 indexing: p2-1 may reference trail half
2178 // but char32At will get the full code point.
340931cb
A
2179 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2180 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
0f5d89e8
A
2181 continue;
2182 }
2183
0f5d89e8 2184 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
340931cb 2185 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2ca993e8
A
2186 continue;
2187 }
2188
57a6839d
A
2189 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2190 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
340931cb 2191 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
73c04bcf
A
2192 continue;
2193 }
2194
57a6839d
A
2195 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2196 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2197 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
340931cb
A
2198 setAppliedRule(p2,
2199 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
57a6839d
A
2200 continue;
2201 }
2202
57a6839d
A
2203 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2204 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2205 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
340931cb
A
2206 setAppliedRule(p2,
2207 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
73c04bcf
A
2208 continue;
2209 }
2210
57a6839d 2211 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
340931cb 2212 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
57a6839d
A
2213 continue;
2214 }
73c04bcf 2215
340931cb
A
2216 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2217 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
57a6839d
A
2218 continue;
2219 }
2220
57a6839d 2221 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
340931cb 2222 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
73c04bcf
A
2223 continue;
2224 }
2225
73c04bcf 2226 if (fNumericSet->contains(c1) &&
340931cb
A
2227 fNumericSet->contains(c2)) {
2228 setAppliedRule(p2, "WB8 Numeric x Numeric");
73c04bcf
A
2229 continue;
2230 }
2231
57a6839d 2232 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
340931cb
A
2233 fNumericSet->contains(c2)) {
2234 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
73c04bcf
A
2235 continue;
2236 }
2237
73c04bcf 2238 if (fNumericSet->contains(c1) &&
57a6839d 2239 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
340931cb 2240 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
73c04bcf
A
2241 continue;
2242 }
2243
340931cb 2244 if (fNumericSet->contains(c0) &&
57a6839d 2245 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
73c04bcf 2246 fNumericSet->contains(c2)) {
340931cb 2247 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
73c04bcf
A
2248 continue;
2249 }
2250
73c04bcf 2251 if (fNumericSet->contains(c1) &&
57a6839d 2252 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
73c04bcf 2253 fNumericSet->contains(c3)) {
340931cb 2254 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
73c04bcf
A
2255 continue;
2256 }
2257
f3c0d7a5
A
2258 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2259 // all Katakana are handled by the dictionary breaker.
73c04bcf
A
2260 if (fKatakanaSet->contains(c1) &&
2261 fKatakanaSet->contains(c2)) {
340931cb 2262 setAppliedRule(p2, "WB13 Katakana x Katakana");
73c04bcf
A
2263 continue;
2264 }
2265
57a6839d 2266 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
73c04bcf
A
2267 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2268 fExtendNumLetSet->contains(c2)) {
340931cb
A
2269 setAppliedRule(p2,
2270 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2271 continue;
51004dcb 2272 }
73c04bcf 2273
73c04bcf 2274 if (fExtendNumLetSet->contains(c1) &&
57a6839d
A
2275 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2276 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
340931cb 2277 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
57a6839d 2278 continue;
51004dcb
A
2279 }
2280
2ca993e8 2281 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
340931cb 2282 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2ca993e8
A
2283 break;
2284 }
51004dcb 2285 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
340931cb 2286 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
51004dcb
A
2287 continue;
2288 }
73c04bcf 2289
340931cb 2290 setAppliedRule(p2, "WB999");
73c04bcf
A
2291 break;
2292 }
2293
2294 breakPos = p2;
2295 return breakPos;
2296}
2297
2298
2299UVector *RBBIWordMonkey::charClasses() {
2300 return fSets;
2301}
2302
73c04bcf
A
2303RBBIWordMonkey::~RBBIWordMonkey() {
2304 delete fSets;
46f4442e
A
2305 delete fCRSet;
2306 delete fLFSet;
2307 delete fNewlineSet;
73c04bcf 2308 delete fKatakanaSet;
57a6839d 2309 delete fHebrew_LetterSet;
73c04bcf 2310 delete fALetterSet;
57a6839d
A
2311 delete fSingle_QuoteSet;
2312 delete fDouble_QuoteSet;
46f4442e 2313 delete fMidNumLetSet;
73c04bcf
A
2314 delete fMidLetterSet;
2315 delete fMidNumSet;
2316 delete fNumericSet;
2317 delete fFormatSet;
2318 delete fExtendSet;
2319 delete fExtendNumLetSet;
0f5d89e8 2320 delete fWSegSpaceSet;
51004dcb 2321 delete fRegionalIndicatorSet;
f3c0d7a5 2322 delete fDictionarySet;
73c04bcf 2323 delete fOtherSet;
f3c0d7a5
A
2324 delete fZWJSet;
2325 delete fExtendedPictSet;
73c04bcf
A
2326}
2327
2328
2329
2330
2331//------------------------------------------------------------------------------------------
2332//
2333// class RBBISentMonkey Sentence Break specific implementation
2334// of RBBIMonkeyKind.
2335//
2336//------------------------------------------------------------------------------------------
2337class RBBISentMonkey: public RBBIMonkeyKind {
2338public:
2339 RBBISentMonkey();
2340 virtual ~RBBISentMonkey();
2341 virtual UVector *charClasses();
2342 virtual void setText(const UnicodeString &s);
2343 virtual int32_t next(int32_t i);
2344private:
2345 int moveBack(int posFrom);
2346 int moveForward(int posFrom);
2347 UChar32 cAt(int pos);
2348
2349 UVector *fSets;
2350
2351 UnicodeSet *fSepSet;
2352 UnicodeSet *fFormatSet;
2353 UnicodeSet *fSpSet;
2354 UnicodeSet *fLowerSet;
2355 UnicodeSet *fUpperSet;
2356 UnicodeSet *fOLetterSet;
2357 UnicodeSet *fNumericSet;
2358 UnicodeSet *fATermSet;
46f4442e 2359 UnicodeSet *fSContinueSet;
73c04bcf
A
2360 UnicodeSet *fSTermSet;
2361 UnicodeSet *fCloseSet;
2362 UnicodeSet *fOtherSet;
2363 UnicodeSet *fExtendSet;
2364
2365 const UnicodeString *fText;
73c04bcf
A
2366};
2367
2368RBBISentMonkey::RBBISentMonkey()
2369{
2370 UErrorCode status = U_ZERO_ERROR;
2371
2372 fSets = new UVector(status);
2373
46f4442e
A
2374 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2375 // set and made into character classes of their own. For the monkey impl,
2376 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2377 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2378 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2379 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2380 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2381 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2382 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2383 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2384 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2385 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2386 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2387 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2388 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
73c04bcf
A
2389 fOtherSet = new UnicodeSet();
2390
2391 if(U_FAILURE(status)) {
2392 deferredStatus = status;
2393 return;
2394 }
2395
2396 fOtherSet->complement();
2397 fOtherSet->removeAll(*fSepSet);
2398 fOtherSet->removeAll(*fFormatSet);
2399 fOtherSet->removeAll(*fSpSet);
2400 fOtherSet->removeAll(*fLowerSet);
2401 fOtherSet->removeAll(*fUpperSet);
2402 fOtherSet->removeAll(*fOLetterSet);
2403 fOtherSet->removeAll(*fNumericSet);
2404 fOtherSet->removeAll(*fATermSet);
46f4442e 2405 fOtherSet->removeAll(*fSContinueSet);
73c04bcf
A
2406 fOtherSet->removeAll(*fSTermSet);
2407 fOtherSet->removeAll(*fCloseSet);
2408 fOtherSet->removeAll(*fExtendSet);
2409
340931cb
A
2410 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2411 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2412 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2413 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2414 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2415 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2416 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2417 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2418 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2419 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2420 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2421 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2422 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
73c04bcf
A
2423
2424 if (U_FAILURE(status)) {
2425 deferredStatus = status;
2426 }
2427}
2428
2429
2430
2431void RBBISentMonkey::setText(const UnicodeString &s) {
2432 fText = &s;
340931cb 2433 prepareAppliedRules(s.length());
73c04bcf
A
2434}
2435
2436UVector *RBBISentMonkey::charClasses() {
2437 return fSets;
2438}
2439
73c04bcf
A
2440// moveBack() Find the "significant" code point preceding the index i.
2441// Skips over ($Extend | $Format)* .
46f4442e 2442//
73c04bcf
A
2443int RBBISentMonkey::moveBack(int i) {
2444 if (i <= 0) {
2445 return -1;
2446 }
2447 UChar32 c;
2448 int32_t j = i;
2449 do {
2450 j = fText->moveIndex32(j, -1);
2451 c = fText->char32At(j);
2452 }
2453 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2454 return j;
2455
2456 }
2457
2458
2459int RBBISentMonkey::moveForward(int i) {
2460 if (i>=fText->length()) {
2461 return fText->length();
2462 }
2463 UChar32 c;
2464 int32_t j = i;
2465 do {
2466 j = fText->moveIndex32(j, 1);
2467 c = cAt(j);
2468 }
2469 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2470 return j;
2471}
2472
2473UChar32 RBBISentMonkey::cAt(int pos) {
2474 if (pos<0 || pos>=fText->length()) {
2475 return -1;
2476 } else {
2477 return fText->char32At(pos);
2478 }
2479}
2480
2481int32_t RBBISentMonkey::next(int32_t prevPos) {
2482 int p0, p1, p2, p3; // Indices of the significant code points around the
2483 // break position being tested. The candidate break
2484 // location is before p2.
2485
2486 int breakPos = -1;
2487
2488 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2489 UChar32 c;
2490
46f4442e
A
2491 if (U_FAILURE(deferredStatus)) {
2492 return -1;
2493 }
2494
73c04bcf
A
2495 // Prev break at end of string. return DONE.
2496 if (prevPos >= fText->length()) {
2497 return -1;
2498 }
2499 p0 = p1 = p2 = p3 = prevPos;
2500 c3 = fText->char32At(prevPos);
2501 c0 = c1 = c2 = 0;
57a6839d 2502 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2503
2504 // Loop runs once per "significant" character position in the input text.
2505 for (;;) {
2506 // Move all of the positions forward in the input string.
2507 p0 = p1; c0 = c1;
2508 p1 = p2; c1 = c2;
2509 p2 = p3; c2 = c3;
46f4442e 2510
340931cb 2511 // Advance p3 by X(Extend | Format)* Rule 4
73c04bcf
A
2512 p3 = moveForward(p3);
2513 c3 = cAt(p3);
2514
73c04bcf 2515 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
340931cb 2516 setAppliedRule(p2, "SB3 CR x LF");
73c04bcf
A
2517 continue;
2518 }
46f4442e 2519
73c04bcf
A
2520 if (fSepSet->contains(c1)) {
2521 p2 = p1+1; // Separators don't combine with Extend or Format.
340931cb
A
2522
2523 setAppliedRule(p2, "SB4 Sep <break>");
73c04bcf
A
2524 break;
2525 }
2526
2527 if (p2 >= fText->length()) {
2528 // Reached end of string. Always a break position.
340931cb 2529 setAppliedRule(p2, "SB4 Sep <break>");
73c04bcf
A
2530 break;
2531 }
2532
2533 if (p2 == prevPos) {
2534 // Still warming up the loop. (won't work with zero length strings, but we don't care)
340931cb 2535 setAppliedRule(p2, "SB4 Sep <break>");
73c04bcf
A
2536 continue;
2537 }
46f4442e 2538
73c04bcf 2539 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
340931cb 2540 setAppliedRule(p2, "SB6 ATerm x Numeric");
73c04bcf
A
2541 continue;
2542 }
2543
340931cb 2544 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2ca993e8 2545 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
340931cb 2546 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
73c04bcf
A
2547 continue;
2548 }
2549
73c04bcf
A
2550 // Note: STerm | ATerm are added to the negated part of the expression by a
2551 // note to the Unicode 5.0 documents.
2552 int p8 = p1;
2553 while (fSpSet->contains(cAt(p8))) {
2554 p8 = moveBack(p8);
2555 }
2556 while (fCloseSet->contains(cAt(p8))) {
2557 p8 = moveBack(p8);
2558 }
2559 if (fATermSet->contains(cAt(p8))) {
2560 p8=p2;
2561 for (;;) {
2562 c = cAt(p8);
2563 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2564 fLowerSet->contains(c) || fSepSet->contains(c) ||
2565 fATermSet->contains(c) || fSTermSet->contains(c)) {
340931cb
A
2566
2567 setAppliedRule(p2,
2568 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
73c04bcf
A
2569 break;
2570 }
2571 p8 = moveForward(p8);
2572 }
2573 if (fLowerSet->contains(cAt(p8))) {
340931cb
A
2574
2575 setAppliedRule(p2,
2576 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
73c04bcf
A
2577 continue;
2578 }
2579 }
46f4442e 2580
46f4442e 2581 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
73c04bcf
A
2582 p8 = p1;
2583 while (fSpSet->contains(cAt(p8))) {
2584 p8 = moveBack(p8);
2585 }
2586 while (fCloseSet->contains(cAt(p8))) {
2587 p8 = moveBack(p8);
2588 }
2589 c = cAt(p8);
2590 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
340931cb 2591 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
73c04bcf
A
2592 continue;
2593 }
2594 }
2595
73c04bcf
A
2596 int p9 = p1;
2597 while (fCloseSet->contains(cAt(p9))) {
2598 p9 = moveBack(p9);
2599 }
2600 c = cAt(p9);
2601 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2602 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
340931cb
A
2603
2604 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
73c04bcf
A
2605 continue;
2606 }
2607 }
2608
73c04bcf
A
2609 int p10 = p1;
2610 while (fSpSet->contains(cAt(p10))) {
2611 p10 = moveBack(p10);
2612 }
2613 while (fCloseSet->contains(cAt(p10))) {
2614 p10 = moveBack(p10);
2615 }
2616 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2617 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
340931cb 2618 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
73c04bcf
A
2619 continue;
2620 }
2621 }
2622
73c04bcf 2623 int p11 = p1;
46f4442e
A
2624 if (fSepSet->contains(cAt(p11))) {
2625 p11 = moveBack(p11);
2626 }
73c04bcf
A
2627 while (fSpSet->contains(cAt(p11))) {
2628 p11 = moveBack(p11);
2629 }
2630 while (fCloseSet->contains(cAt(p11))) {
2631 p11 = moveBack(p11);
2632 }
2633 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
340931cb 2634 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
73c04bcf
A
2635 break;
2636 }
2637
340931cb 2638 setAppliedRule(p2, "SB12 Any x Any");
73c04bcf
A
2639 continue;
2640 }
340931cb 2641
73c04bcf
A
2642 breakPos = p2;
2643 return breakPos;
2644}
2645
2646RBBISentMonkey::~RBBISentMonkey() {
2647 delete fSets;
2648 delete fSepSet;
2649 delete fFormatSet;
2650 delete fSpSet;
2651 delete fLowerSet;
2652 delete fUpperSet;
2653 delete fOLetterSet;
2654 delete fNumericSet;
2655 delete fATermSet;
46f4442e 2656 delete fSContinueSet;
73c04bcf
A
2657 delete fSTermSet;
2658 delete fCloseSet;
2659 delete fOtherSet;
2660 delete fExtendSet;
2661}
2662
2663
2664
2665//-------------------------------------------------------------------------------------------
2666//
2667// RBBILineMonkey
2668//
2669//-------------------------------------------------------------------------------------------
2670
2671class RBBILineMonkey: public RBBIMonkeyKind {
2672public:
2673 RBBILineMonkey();
2674 virtual ~RBBILineMonkey();
2675 virtual UVector *charClasses();
2676 virtual void setText(const UnicodeString &s);
2677 virtual int32_t next(int32_t i);
2678 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2679private:
2680 UVector *fSets;
2681
2682 UnicodeSet *fBK;
2683 UnicodeSet *fCR;
2684 UnicodeSet *fLF;
2685 UnicodeSet *fCM;
2686 UnicodeSet *fNL;
2687 UnicodeSet *fSG;
2688 UnicodeSet *fWJ;
2689 UnicodeSet *fZW;
2690 UnicodeSet *fGL;
2691 UnicodeSet *fCB;
2692 UnicodeSet *fSP;
2693 UnicodeSet *fB2;
2694 UnicodeSet *fBA;
2695 UnicodeSet *fBB;
3d1f044b 2696 UnicodeSet *fHH;
73c04bcf
A
2697 UnicodeSet *fHY;
2698 UnicodeSet *fH2;
2699 UnicodeSet *fH3;
2700 UnicodeSet *fCL;
729e4ab9 2701 UnicodeSet *fCP;
73c04bcf
A
2702 UnicodeSet *fEX;
2703 UnicodeSet *fIN;
2704 UnicodeSet *fJL;
2705 UnicodeSet *fJV;
2706 UnicodeSet *fJT;
2707 UnicodeSet *fNS;
2708 UnicodeSet *fOP;
2709 UnicodeSet *fQU;
2710 UnicodeSet *fIS;
2711 UnicodeSet *fNU;
2712 UnicodeSet *fPO;
2713 UnicodeSet *fPR;
2714 UnicodeSet *fSY;
2715 UnicodeSet *fAI;
2716 UnicodeSet *fAL;
4388f060
A
2717 UnicodeSet *fCJ;
2718 UnicodeSet *fHL;
73c04bcf 2719 UnicodeSet *fID;
51004dcb 2720 UnicodeSet *fRI;
73c04bcf 2721 UnicodeSet *fXX;
2ca993e8
A
2722 UnicodeSet *fEB;
2723 UnicodeSet *fEM;
3d1f044b 2724 UnicodeSet *fZWJ;
340931cb
A
2725 UnicodeSet *fOP30;
2726 UnicodeSet *fCP30;
73c04bcf 2727
57a6839d 2728 BreakIterator *fCharBI;
73c04bcf 2729 const UnicodeString *fText;
73c04bcf 2730 RegexMatcher *fNumberMatcher;
73c04bcf
A
2731};
2732
2ca993e8
A
2733RBBILineMonkey::RBBILineMonkey() :
2734 RBBIMonkeyKind(),
2735 fSets(NULL),
2736
2737 fCharBI(NULL),
2738 fText(NULL),
2739 fNumberMatcher(NULL)
73c04bcf 2740
73c04bcf 2741{
2ca993e8
A
2742 if (U_FAILURE(deferredStatus)) {
2743 return;
2744 }
2745
73c04bcf
A
2746 UErrorCode status = U_ZERO_ERROR;
2747
2748 fSets = new UVector(status);
2749
46f4442e
A
2750 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2751 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2752 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2753 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2754 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2755 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2756 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2757 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2758 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2759 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2760 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2761 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2762 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3d1f044b 2763 fHH = new UnicodeSet();
46f4442e
A
2764 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2765 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2766 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
340931cb 2767 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status); // en adjustments for rdar://problem/51193810
729e4ab9 2768 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
46f4442e
A
2769 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2770 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2771 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2772 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2773 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2774 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
340931cb
A
2775 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status); // en adjustments
2776 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status); // en adjustments
46f4442e
A
2777 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2778 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2779 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2780 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2781 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2782 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2783 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
4388f060
A
2784 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2785 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
46f4442e 2786 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
51004dcb 2787 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
46f4442e
A
2788 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2789 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
f3c0d7a5
A
2790 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2791 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
340931cb
A
2792 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2793 fOP30 = new UnicodeSet(u"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); // en adjustments
2794 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
73c04bcf
A
2795
2796 if (U_FAILURE(status)) {
2797 deferredStatus = status;
73c04bcf
A
2798 return;
2799 }
2800
2801 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2802 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
73c04bcf
A
2803 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2804
4388f060 2805 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3d1f044b
A
2806 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2807
2808 fHH->add(u'\u2010'); // Hyphen, '‐'
2ca993e8 2809
340931cb
A
2810 // Sets and names.
2811 fSets->addElement(fBK, status); classNames.push_back("fBK");
2812 fSets->addElement(fCR, status); classNames.push_back("fCR");
2813 fSets->addElement(fLF, status); classNames.push_back("fLF");
2814 fSets->addElement(fCM, status); classNames.push_back("fCM");
2815 fSets->addElement(fNL, status); classNames.push_back("fNL");
2816 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2817 fSets->addElement(fZW, status); classNames.push_back("fZW");
2818 fSets->addElement(fGL, status); classNames.push_back("fGL");
2819 fSets->addElement(fCB, status); classNames.push_back("fCB");
2820 fSets->addElement(fSP, status); classNames.push_back("fSP");
2821 fSets->addElement(fB2, status); classNames.push_back("fB2");
2822 fSets->addElement(fBA, status); classNames.push_back("fBA");
2823 fSets->addElement(fBB, status); classNames.push_back("fBB");
2824 fSets->addElement(fHY, status); classNames.push_back("fHY");
2825 fSets->addElement(fH2, status); classNames.push_back("fH2");
2826 fSets->addElement(fH3, status); classNames.push_back("fH3");
2827 fSets->addElement(fCL, status); classNames.push_back("fCL");
2828 fSets->addElement(fCP, status); classNames.push_back("fCP");
2829 fSets->addElement(fEX, status); classNames.push_back("fEX");
2830 fSets->addElement(fIN, status); classNames.push_back("fIN");
2831 fSets->addElement(fJL, status); classNames.push_back("fJL");
2832 fSets->addElement(fJT, status); classNames.push_back("fJT");
2833 fSets->addElement(fJV, status); classNames.push_back("fJV");
2834 fSets->addElement(fNS, status); classNames.push_back("fNS");
2835 fSets->addElement(fOP, status); classNames.push_back("fOP");
2836 fSets->addElement(fQU, status); classNames.push_back("fQU");
2837 fSets->addElement(fIS, status); classNames.push_back("fIS");
2838 fSets->addElement(fNU, status); classNames.push_back("fNU");
2839 fSets->addElement(fPO, status); classNames.push_back("fPO");
2840 fSets->addElement(fPR, status); classNames.push_back("fPR");
2841 fSets->addElement(fSY, status); classNames.push_back("fSY");
2842 fSets->addElement(fAI, status); classNames.push_back("fAI");
2843 fSets->addElement(fAL, status); classNames.push_back("fAL");
2844 fSets->addElement(fHL, status); classNames.push_back("fHL");
2845 fSets->addElement(fID, status); classNames.push_back("fID");
2846 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2847 fSets->addElement(fRI, status); classNames.push_back("fRI");
2848 fSets->addElement(fSG, status); classNames.push_back("fSG");
2849 fSets->addElement(fEB, status); classNames.push_back("fEB");
2850 fSets->addElement(fEM, status); classNames.push_back("fEM");
2851 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2852 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2853 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2854 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
73c04bcf 2855
2ca993e8 2856 const char *rules =
f3c0d7a5
A
2857 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2858 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3d1f044b 2859 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
f3c0d7a5
A
2860 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2861 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2862 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2863 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
46f4442e 2864
73c04bcf 2865 fNumberMatcher = new RegexMatcher(
46f4442e 2866 UnicodeString(rules, -1, US_INV), 0, status);
73c04bcf
A
2867
2868 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2869
2870 if (U_FAILURE(status)) {
2871 deferredStatus = status;
2872 }
340931cb 2873
73c04bcf
A
2874}
2875
2876
2877void RBBILineMonkey::setText(const UnicodeString &s) {
2878 fText = &s;
2879 fCharBI->setText(s);
340931cb 2880 prepareAppliedRules(s.length());
73c04bcf
A
2881 fNumberMatcher->reset(s);
2882}
2883
2884//
2885// rule9Adjust
2886// Line Break TR rules 9 and 10 implementation.
2887// This deals with combining marks and other sequences that
2888// that must be treated as if they were something other than what they actually are.
2889//
2890// This is factored out into a separate function because it must be applied twice for
2891// each potential break, once to the chars before the position being checked, then
2892// again to the text following the possible break.
2893//
2894void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2895 if (pos == -1) {
2896 // Invalid initial position. Happens during the warmup iteration of the
2897 // main loop in next().
2898 return;
2899 }
2900
2901 int32_t nPos = *nextPos;
2902
2903 // LB 9 Keep combining sequences together.
340931cb
A
2904 // advance over any CM class chars. Note that Line Break CM is different
2905 // from the normal Grapheme Extend property.
73c04bcf
A
2906 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2907 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2908 for (;;) {
2909 *nextChar = fText->char32At(nPos);
2910 if (!fCM->contains(*nextChar)) {
2911 break;
2912 }
2913 nPos = fText->moveIndex32(nPos, 1);
2914 }
2915 }
2916
2917
2918 // LB 9 Treat X CM* as if it were x.
2919 // No explicit action required.
2920
2921 // LB 10 Treat any remaining combining mark as AL
2922 if (fCM->contains(*posChar)) {
f3c0d7a5 2923 *posChar = u'A';
73c04bcf
A
2924 }
2925
2926 // Push the updated nextPos and nextChar back to our caller.
2927 // This only makes a difference if posChar got bigger by consuming a
2928 // combining sequence.
2929 *nextPos = nPos;
2930 *nextChar = fText->char32At(nPos);
2931}
2932
2933
2934
2935int32_t RBBILineMonkey::next(int32_t startPos) {
2936 UErrorCode status = U_ZERO_ERROR;
2937 int32_t pos; // Index of the char following a potential break position
2938 UChar32 thisChar; // Character at above position "pos"
2939
2940 int32_t prevPos; // Index of the char preceding a potential break position
2941 UChar32 prevChar; // Character at above position. Note that prevChar
2942 // and thisChar may not be adjacent because combining
2943 // characters between them will be ignored.
2944
4388f060
A
2945 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2946 UChar32 prevCharX2;
2947
73c04bcf
A
2948 int32_t nextPos; // Index of the next character following pos.
2949 // Usually skips over combining marks.
2950 int32_t nextCPPos; // Index of the code point following "pos."
2951 // May point to a combining mark.
2952 int32_t tPos; // temp value.
2953 UChar32 c;
2954
46f4442e
A
2955 if (U_FAILURE(deferredStatus)) {
2956 return -1;
2957 }
2958
73c04bcf
A
2959 if (startPos >= fText->length()) {
2960 return -1;
2961 }
2962
2963
2964 // Initial values for loop. Loop will run the first time without finding breaks,
2965 // while the invalid values shift out and the "this" and
2966 // "prev" positions are filled in with good values.
4388f060
A
2967 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2968 thisChar = prevChar = prevCharX2 = 0;
73c04bcf
A
2969 nextPos = nextCPPos = startPos;
2970
2971
2972 // Loop runs once per position in the test text, until a break position
2973 // is found.
2974 for (;;) {
4388f060
A
2975 prevPosX2 = prevPos;
2976 prevCharX2 = prevChar;
2977
73c04bcf
A
2978 prevPos = pos;
2979 prevChar = thisChar;
2980
2981 pos = nextPos;
2982 thisChar = fText->char32At(pos);
2983
2984 nextCPPos = fText->moveIndex32(pos, 1);
2985 nextPos = nextCPPos;
2986
340931cb 2987
73c04bcf 2988 if (pos >= fText->length()) {
340931cb 2989 setAppliedRule(pos, "LB2 - Break at end of text.");
73c04bcf
A
2990 break;
2991 }
2992
340931cb 2993
73c04bcf
A
2994 // We do this one out-of-order because the adjustment does not change anything
2995 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2996 // be applied.
340931cb 2997 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
73c04bcf
A
2998 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2999 c = fText->char32At(nextPos);
340931cb 3000 rule9Adjust(pos, &thisChar, &nextPos, &c);
73c04bcf
A
3001
3002 // If the loop is still warming up - if we haven't shifted the initial
3003 // -1 positions out of prevPos yet - loop back to advance the
3004 // position in the input without any further looking for breaks.
3005 if (prevPos == -1) {
340931cb 3006 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
73c04bcf
A
3007 continue;
3008 }
46f4442e 3009
340931cb 3010
73c04bcf 3011 if (fBK->contains(prevChar)) {
340931cb 3012 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
73c04bcf
A
3013 break;
3014 }
3015
340931cb 3016
73c04bcf 3017 if (prevChar == 0x0d && thisChar == 0x0a) {
340931cb 3018 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
73c04bcf
A
3019 continue;
3020 }
3021 if (prevChar == 0x0d ||
3022 prevChar == 0x0a ||
3023 prevChar == 0x85) {
340931cb 3024 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
73c04bcf
A
3025 break;
3026 }
3027
340931cb 3028
73c04bcf
A
3029 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3030 fBK->contains(thisChar)) {
340931cb
A
3031 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
3032 continue;
73c04bcf
A
3033 }
3034
3035
73c04bcf 3036 if (fSP->contains(thisChar)) {
340931cb 3037 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
73c04bcf
A
3038 continue;
3039 }
3040
340931cb 3041 // !!! ??? Is this the right text for the applied rule?
73c04bcf 3042 if (fZW->contains(thisChar)) {
340931cb 3043 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
73c04bcf
A
3044 continue;
3045 }
3046
340931cb 3047
3d1f044b
A
3048 // ZW SP* ÷
3049 // Scan backwards from prevChar for SP* ZW
3050 tPos = prevPos;
3051 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3052 tPos = fText->moveIndex32(tPos, -1);
3053 }
3054 if (fZW->contains(fText->char32At(tPos))) {
340931cb 3055 setAppliedRule(pos, "LB 8 Break after zero width space");
73c04bcf
A
3056 break;
3057 }
3058
340931cb 3059
0f5d89e8
A
3060 // Move this test up, before LB8a, because numbers can match a longer sequence that would
3061 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
3062 if (fNumberMatcher->lookingAt(prevPos, status)) {
3063 if (U_FAILURE(status)) {
340931cb 3064 setAppliedRule(pos, "LB 25 Numbers");
0f5d89e8
A
3065 break;
3066 }
3067 // Matched a number. But could have been just a single digit, which would
3068 // not represent a "no break here" between prevChar and thisChar
3069 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3070 if (numEndIdx > pos) {
3071 // Number match includes at least our two chars being checked
3072 if (numEndIdx > nextPos) {
3073 // Number match includes additional chars. Update pos and nextPos
3074 // so that next loop iteration will continue at the end of the number,
3075 // checking for breaks between last char in number & whatever follows.
3076 pos = nextPos = numEndIdx;
3077 do {
3078 pos = fText->moveIndex32(pos, -1);
3079 thisChar = fText->char32At(pos);
3080 } while (fCM->contains(thisChar));
3081 }
340931cb 3082 setAppliedRule(pos, "LB 25 Numbers");
0f5d89e8
A
3083 continue;
3084 }
3085 }
3086
340931cb 3087
2ca993e8
A
3088 // The monkey test's way of ignoring combining characters doesn't work
3089 // for this rule. ZJ is also a CM. Need to get the actual character
3090 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3091 {
3092 int32_t prevIdx = fText->moveIndex32(pos, -1);
3093 UChar32 prevC = fText->char32At(prevIdx);
3d1f044b 3094 if (fZWJ->contains(prevC)) {
340931cb 3095 setAppliedRule(pos, "LB 8a ZWJ x");
2ca993e8
A
3096 continue;
3097 }
3098 }
3099
340931cb
A
3100
3101 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
73c04bcf
A
3102 //
3103
3104
73c04bcf
A
3105 // x WJ
3106 // WJ x
3107 //
3108 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
340931cb 3109 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
73c04bcf
A
3110 continue;
3111 }
3112
340931cb 3113
46f4442e 3114 if (fGL->contains(prevChar)) {
340931cb 3115 setAppliedRule(pos, "LB 12 GL x");
73c04bcf
A
3116 continue;
3117 }
2ca993e8 3118
340931cb
A
3119
3120 if (!(fSP->contains(prevChar) ||
46f4442e
A
3121 fBA->contains(prevChar) ||
3122 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
340931cb
A
3123 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3124 continue;
46f4442e
A
3125 }
3126
340931cb 3127
3d1f044b
A
3128 if (fCL->contains(thisChar) ||
3129 fCP->contains(thisChar) ||
3130 fEX->contains(thisChar) ||
3131 fSY->contains(thisChar)) {
340931cb 3132 setAppliedRule(pos, "LB 13 Don't break before closings.");
73c04bcf
A
3133 continue;
3134 }
3135
340931cb 3136
73c04bcf
A
3137 // Scan backwards, checking for this sequence.
3138 // The OP char could include combining marks, so we actually check for
3139 // OP CM* SP*
3d1f044b 3140 // Another Twist: The Rule 9 fixes may have changed a SP CM
73c04bcf
A
3141 // sequence into a ID char, so before scanning back through spaces,
3142 // verify that prevChar is indeed a space. The prevChar variable
3143 // may differ from fText[prevPos]
3144 tPos = prevPos;
3145 if (fSP->contains(prevChar)) {
3146 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3147 tPos=fText->moveIndex32(tPos, -1);
3148 }
3149 }
3150 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3151 tPos=fText->moveIndex32(tPos, -1);
3152 }
3153 if (fOP->contains(fText->char32At(tPos))) {
340931cb 3154 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
73c04bcf
A
3155 continue;
3156 }
3157
3158
3d1f044b
A
3159 if (nextPos < fText->length()) {
3160 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3161 // from a legit ffff character. So test length separately.
3162 UChar32 nextChar = fText->char32At(nextPos);
3163 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
340931cb 3164 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3d1f044b
A
3165 break;
3166 }
3167 }
3168
340931cb
A
3169
3170 if (fIS->contains(thisChar)) {
3171 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3172 continue;
3d1f044b
A
3173 }
3174
340931cb 3175
73c04bcf
A
3176 if (fOP->contains(thisChar)) {
3177 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3178 int tPos = prevPos;
3179 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3180 tPos = fText->moveIndex32(tPos, -1);
3181 }
3182 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3183 tPos = fText->moveIndex32(tPos, -1);
3184 }
3185 if (fQU->contains(fText->char32At(tPos))) {
340931cb 3186 setAppliedRule(pos, "LB 15 QU SP* x OP");
73c04bcf
A
3187 continue;
3188 }
3189 }
3190
3191
729e4ab9 3192 // Scan backwards for SP* CM* (CL | CP)
73c04bcf
A
3193 if (fNS->contains(thisChar)) {
3194 int tPos = prevPos;
3195 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3196 tPos = fText->moveIndex32(tPos, -1);
3197 }
3198 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3199 tPos = fText->moveIndex32(tPos, -1);
3200 }
729e4ab9 3201 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
340931cb 3202 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
73c04bcf
A
3203 continue;
3204 }
3205 }
3206
3207
73c04bcf
A
3208 if (fB2->contains(thisChar)) {
3209 // Scan backwards, checking for the B2 CM* SP* sequence.
3210 tPos = prevPos;
3211 if (fSP->contains(prevChar)) {
3212 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3213 tPos=fText->moveIndex32(tPos, -1);
3214 }
3215 }
3216 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3217 tPos=fText->moveIndex32(tPos, -1);
3218 }
3219 if (fB2->contains(fText->char32At(tPos))) {
340931cb 3220 setAppliedRule(pos, "LB 17 B2 SP* x B2");
73c04bcf
A
3221 continue;
3222 }
3223 }
3224
46f4442e 3225
73c04bcf 3226 if (fSP->contains(prevChar)) {
340931cb 3227 setAppliedRule(pos, "LB 18 break after space");
73c04bcf
A
3228 break;
3229 }
3230
73c04bcf
A
3231 // x QU
3232 // QU x
3233 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
340931cb 3234 setAppliedRule(pos, "LB 19");
73c04bcf
A
3235 continue;
3236 }
3237
73c04bcf 3238 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
340931cb 3239 setAppliedRule(pos, "LB 20 Break around a CB");
73c04bcf
A
3240 break;
3241 }
3242
340931cb 3243 // Don't break between Hyphens and letters if a break precedes the hyphen.
3d1f044b
A
3244 // Formerly this was a Finnish tailoring.
3245 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
340931cb 3246 // ^($HY | $HH) $AL;
3d1f044b
A
3247 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3248 prevPosX2 == -1) {
340931cb 3249 setAppliedRule(pos, "LB 20.09");
3d1f044b
A
3250 continue;
3251 }
3252
73c04bcf
A
3253 if (fBA->contains(thisChar) ||
3254 fHY->contains(thisChar) ||
3255 fNS->contains(thisChar) ||
3256 fBB->contains(prevChar) ) {
340931cb 3257 setAppliedRule(pos, "LB 21");
73c04bcf
A
3258 continue;
3259 }
3260
2ca993e8 3261 if (fHL->contains(prevCharX2) &&
4388f060 3262 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
340931cb 3263 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
4388f060
A
3264 continue;
3265 }
3266
51004dcb 3267 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
340931cb 3268 setAppliedRule(pos, "LB 21b SY x HL");
51004dcb
A
3269 continue;
3270 }
3271
340931cb
A
3272 if (fIN->contains(thisChar)) {
3273 setAppliedRule(pos, "LB 22");
73c04bcf
A
3274 continue;
3275 }
3276
3277
340931cb 3278 // (AL | HL) x NU
f3c0d7a5
A
3279 // NU x (AL | HL)
3280 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
340931cb 3281 setAppliedRule(pos, "LB 23");
f3c0d7a5
A
3282 continue;
3283 }
3284 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
340931cb 3285 setAppliedRule(pos, "LB 23");
f3c0d7a5
A
3286 continue;
3287 }
3288
340931cb 3289 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
f3c0d7a5
A
3290 // PR x (ID | EB | EM)
3291 // (ID | EB | EM) x PO
0f5d89e8 3292 if (fPR->contains(prevChar) &&
f3c0d7a5 3293 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
340931cb 3294 setAppliedRule(pos, "LB 23a");
f3c0d7a5
A
3295 continue;
3296 }
0f5d89e8 3297 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
f3c0d7a5 3298 fPO->contains(thisChar)) {
340931cb 3299 setAppliedRule(pos, "LB 23a");
73c04bcf
A
3300 continue;
3301 }
3302
340931cb 3303 // Do not break between prefix and letters or ideographs.
f3c0d7a5
A
3304 // (PR | PO) x (AL | HL)
3305 // (AL | HL) x (PR | PO)
3306 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3307 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
340931cb 3308 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
f3c0d7a5
A
3309 continue;
3310 }
3311 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3312 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
340931cb 3313 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
73c04bcf
A
3314 continue;
3315 }
46f4442e 3316
340931cb 3317 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
73c04bcf 3318
73c04bcf
A
3319 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3320 fJV->contains(thisChar) ||
3321 fH2->contains(thisChar) ||
3322 fH3->contains(thisChar))) {
340931cb
A
3323 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3324 continue;
73c04bcf
A
3325 }
3326
3327 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3328 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
340931cb
A
3329 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3330 continue;
73c04bcf
A
3331 }
3332
3333 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3334 fJT->contains(thisChar)) {
340931cb
A
3335 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3336 continue;
73c04bcf
A
3337 }
3338
73c04bcf
A
3339 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3340 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3341 fIN->contains(thisChar)) {
340931cb
A
3342 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3343 continue;
73c04bcf
A
3344 }
3345 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3346 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3347 fPO->contains(thisChar)) {
340931cb
A
3348 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3349 continue;
73c04bcf
A
3350 }
3351 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3352 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
340931cb
A
3353 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3354 continue;
73c04bcf
A
3355 }
3356
3357
3358
4388f060 3359 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
340931cb 3360 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
73c04bcf
A
3361 continue;
3362 }
3363
340931cb
A
3364 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3365 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3366 continue;
73c04bcf
A
3367 }
3368
729e4ab9
A
3369 // (AL | NU) x OP
3370 // CP x (AL | NU)
340931cb
A
3371 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3372 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
729e4ab9
A
3373 continue;
3374 }
340931cb
A
3375 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3376 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
729e4ab9
A
3377 continue;
3378 }
3379
3d1f044b 3380 // RI x RI
2ca993e8 3381 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
340931cb 3382 setAppliedRule(pos, "LB30a RI RI ÷ RI");
2ca993e8
A
3383 break;
3384 }
51004dcb 3385 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3d1f044b
A
3386 // Two Regional Indicators have been paired.
3387 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3388 // following RI. This is a hack.
3389 thisChar = -1;
340931cb 3390 setAppliedRule(pos, "LB30a RI RI ÷ RI");
51004dcb
A
3391 continue;
3392 }
3393
2ca993e8 3394 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
340931cb 3395 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
2ca993e8
A
3396 continue;
3397 }
3398
340931cb 3399 setAppliedRule(pos, "LB 31 Break everywhere else");
73c04bcf 3400 break;
73c04bcf
A
3401 }
3402
3403 return pos;
3404}
3405
3406
3407UVector *RBBILineMonkey::charClasses() {
3408 return fSets;
3409}
3410
3411
3412RBBILineMonkey::~RBBILineMonkey() {
3413 delete fSets;
3414
3415 delete fBK;
3416 delete fCR;
3417 delete fLF;
3418 delete fCM;
3419 delete fNL;
3420 delete fWJ;
3421 delete fZW;
3422 delete fGL;
3423 delete fCB;
3424 delete fSP;
3425 delete fB2;
3426 delete fBA;
3427 delete fBB;
3d1f044b 3428 delete fHH;
73c04bcf
A
3429 delete fHY;
3430 delete fH2;
3431 delete fH3;
3432 delete fCL;
729e4ab9 3433 delete fCP;
73c04bcf
A
3434 delete fEX;
3435 delete fIN;
3436 delete fJL;
3437 delete fJV;
3438 delete fJT;
3439 delete fNS;
3440 delete fOP;
3441 delete fQU;
3442 delete fIS;
3443 delete fNU;
3444 delete fPO;
3445 delete fPR;
3446 delete fSY;
3447 delete fAI;
3448 delete fAL;
4388f060
A
3449 delete fCJ;
3450 delete fHL;
73c04bcf 3451 delete fID;
51004dcb 3452 delete fRI;
73c04bcf
A
3453 delete fSG;
3454 delete fXX;
2ca993e8
A
3455 delete fEB;
3456 delete fEM;
3d1f044b 3457 delete fZWJ;
340931cb
A
3458 delete fOP30;
3459 delete fCP30;
73c04bcf
A
3460
3461 delete fCharBI;
3462 delete fNumberMatcher;
3463}
3464
3465
3466//-------------------------------------------------------------------------------------------
3467//
3468// TestMonkey
3469//
3470// params
3471// seed=nnnnn Random number starting seed.
3472// Setting the seed allows errors to be reproduced.
3473// loop=nnn Looping count. Controls running time.
3474// -1: run forever.
3475// 0 or greater: run length.
3476//
3477// type = char | word | line | sent | title
3478//
2ca993e8
A
3479// Example:
3480// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3481//
73c04bcf
A
3482//-------------------------------------------------------------------------------------------
3483
3484static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3485 int32_t val = defaultVal;
3486 name.append(" *= *(-?\\d+)");
3487 UErrorCode status = U_ZERO_ERROR;
3488 RegexMatcher m(name, params, 0, status);
3489 if (m.find()) {
3490 // The param exists. Convert the string to an int.
3491 char valString[100];
3492 int32_t paramLength = m.end(1, status) - m.start(1, status);
3493 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3494 paramLength = (int32_t)(sizeof(valString)-2);
3495 }
3496 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
340931cb 3497 val = strtol(valString, NULL, 10);
73c04bcf
A
3498
3499 // Delete this parameter from the params string.
3500 m.reset();
3501 params = m.replaceFirst("", status);
3502 }
3503 U_ASSERT(U_SUCCESS(status));
3504 return val;
3505}
3506#endif
3507
51004dcb 3508#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
3509static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3510 BreakIterator *bi,
3511 int expected[],
3512 int expectedcount)
3513{
3514 int count = 0;
3515 int i = 0;
3516 int forward[50];
3517 bi->setText(ustr);
3518 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3519 forward[count] = i;
3520 if (count < expectedcount && expected[count] != i) {
0f5d89e8
A
3521 test->errln("%s:%d break forward test failed: expected %d but got %d",
3522 __FILE__, __LINE__, expected[count], i);
73c04bcf
A
3523 break;
3524 }
3525 count ++;
3526 }
3527 if (count != expectedcount) {
3528 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3529 test->errln("%s:%d break forward test failed: missed %d match",
3530 __FILE__, __LINE__, expectedcount - count);
73c04bcf
A
3531 return;
3532 }
3533 // testing boundaries
3534 for (i = 1; i < expectedcount; i ++) {
3535 int j = expected[i - 1];
3536 if (!bi->isBoundary(j)) {
3537 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3538 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3539 __FILE__, __LINE__, j);
73c04bcf
A
3540 return;
3541 }
3542 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3543 if (bi->isBoundary(j)) {
3544 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3545 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3546 __FILE__, __LINE__, j);
73c04bcf
A
3547 return;
3548 }
3549 }
3550 }
3551
3552 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3553 count --;
3554 if (forward[count] != i) {
51004dcb 3555 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3556 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3557 __FILE__, __LINE__, forward[count], i);
73c04bcf
A
3558 break;
3559 }
3560 }
3561 if (count != 0) {
3562 printStringBreaks(ustr, expected, expectedcount);
3563 test->errln("break test previous() failed: missed a match");
3564 return;
3565 }
3566
3567 // testing preceding
3568 for (i = 0; i < expectedcount - 1; i ++) {
3569 // int j = expected[i] + 1;
3570 int j = ustr.moveIndex32(expected[i], 1);
3571 for (; j <= expected[i + 1]; j ++) {
0f5d89e8
A
3572 int32_t expectedPreceding = expected[i];
3573 int32_t actualPreceding = bi->preceding(j);
3574 if (actualPreceding != expectedPreceding) {
73c04bcf 3575 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3576 test->errln("%s:%d preceding(%d): expected %d, got %d",
3577 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
73c04bcf
A
3578 return;
3579 }
3580 }
3581 }
3582}
51004dcb 3583#endif
73c04bcf
A
3584
3585void RBBITest::TestWordBreaks(void)
3586{
3587#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3588
73c04bcf
A
3589 Locale locale("en");
3590 UErrorCode status = U_ZERO_ERROR;
3591 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3592 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
51004dcb
A
3593 // Replaced any C+J characters in a row with a random sequence of characters
3594 // of the same length to make our C+J segmentation not get in the way.
73c04bcf
A
3595 static const char *strlist[] =
3596 {
3597 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
51004dcb 3598 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
46f4442e 3599 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
73c04bcf 3600 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
51004dcb 3601 "\\uac00\\u3588\\u009c\\u0953\\u194b",
73c04bcf
A
3602 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3603 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
51004dcb 3604 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
73c04bcf
A
3605 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3606 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3607 "\\u2027\\U000e0067\\u0a47\\u00b7",
3608 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3609 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3610 "\\u0589\\U000e006e\\u0a42\\U000104a5",
51004dcb 3611 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
73c04bcf
A
3612 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3613 "\\u0027\\u11af\\U000e0057\\u0602",
3614 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3615 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3616 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3617 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
46f4442e 3618 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
73c04bcf
A
3619 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3620 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3621 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3622 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
51004dcb 3623 "\\u18f4\\U000e0049\\u20e7\\u2027",
73c04bcf
A
3624 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3625 "\\ua183\\u102d\\u0bec\\u003a",
3626 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3627 "\\u003a\\u0e57\\u0fad\\u002e",
3628 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3629 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3630 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3631 "\\u003a\\u0664\\u00b7\\u1fba",
3632 "\\u003b\\u0027\\u00b7\\u47a3",
51004dcb 3633 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
73c04bcf
A
3634 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3635 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3636 };
3637 int loop;
3638 if (U_FAILURE(status)) {
729e4ab9 3639 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3640 return;
3641 }
2ca993e8 3642 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf 3643 // printf("looping %d\n", loop);
46f4442e 3644 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
73c04bcf
A
3645 // RBBICharMonkey monkey;
3646 RBBIWordMonkey monkey;
3647
3648 int expected[50];
3649 int expectedcount = 0;
3650
3651 monkey.setText(ustr);
3652 int i;
3653 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3654 expected[expectedcount ++] = i;
3655 }
3656
3657 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3658 }
3659 delete bi;
3660#endif
3661}
3662
3663void RBBITest::TestWordBoundary(void)
3664{
3665 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3666 Locale locale("en");
3667 UErrorCode status = U_ZERO_ERROR;
3668 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
0f5d89e8
A
3669 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3670 if (U_FAILURE(status)) {
3671 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3672 __FILE__, __LINE__, u_errorName(status));
3673 return;
3674 }
73c04bcf
A
3675 UChar str[50];
3676 static const char *strlist[] =
3677 {
3678 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3679 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3680 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3681 "\\u2027\\U000e0067\\u0a47\\u00b7",
3682 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3683 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3684 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3685 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3686 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3687 "\\u0027\\u11af\\U000e0057\\u0602",
3688 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3689 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3690 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3691 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3692 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
51004dcb 3693 "\\U000e0065\\u302c\\u09ee\\U000e0068",
73c04bcf
A
3694 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3695 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3696 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3697 "\\u58f4\\U000e0049\\u20e7\\u2027",
51004dcb 3698 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
73c04bcf
A
3699 "\\ua183\\u102d\\u0bec\\u003a",
3700 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3701 "\\u003a\\u0e57\\u0fad\\u002e",
3702 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3703 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3704 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3705 "\\u003a\\u0664\\u00b7\\u1fba",
3706 "\\u003b\\u0027\\u00b7\\u47a3",
3707 };
3708 int loop;
2ca993e8 3709 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
0f5d89e8 3710 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
73c04bcf
A
3711 UnicodeString ustr(str);
3712 int forward[50];
3713 int count = 0;
3714
3715 bi->setText(ustr);
0f5d89e8
A
3716 int prev = -1;
3717 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3718 ++count;
3719 if (count >= UPRV_LENGTHOF(forward)) {
3720 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3721 __FILE__, __LINE__, loop, count, boundary);
3722 return;
3723 }
3724 forward[count] = boundary;
3725 if (boundary <= prev) {
3726 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3727 __FILE__, __LINE__, loop, prev, boundary);
3728 break;
3729 }
3730 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3731 if (bi->isBoundary(nonBoundary)) {
3732 printStringBreaks(ustr, forward, count);
3733 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3734 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3735 return;
73c04bcf
A
3736 }
3737 }
0f5d89e8 3738 if (!bi->isBoundary(boundary)) {
73c04bcf 3739 printStringBreaks(ustr, forward, count);
0f5d89e8
A
3740 errln("%s:%d happy boundary test failed: expected %d a boundary",
3741 __FILE__, __LINE__, boundary);
73c04bcf
A
3742 return;
3743 }
0f5d89e8 3744 prev = boundary;
73c04bcf
A
3745 }
3746 }
73c04bcf
A
3747}
3748
3749void RBBITest::TestLineBreaks(void)
3750{
3751#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3752 Locale locale("en");
3753 UErrorCode status = U_ZERO_ERROR;
3754 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3755 const int32_t STRSIZE = 50;
3756 UChar str[STRSIZE];
3757 static const char *strlist[] =
3758 {
3759 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3760 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3761 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3762 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3763 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3764 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3765 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3766 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3767 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3768 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
73c04bcf
A
3769 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3770 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3771 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3772 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3773 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3774 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3775 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3776 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3777 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3778 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3779 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3780 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3781 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3782 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3783 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
73c04bcf
A
3784 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3785 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3786 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3787 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3788 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
73c04bcf
A
3789 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3790 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
73c04bcf
A
3791 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3792 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3793 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3794 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3795 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3796 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
73c04bcf
A
3797 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3798 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3799 };
3800 int loop;
3801 TEST_ASSERT_SUCCESS(status);
3802 if (U_FAILURE(status)) {
3803 return;
3804 }
2ca993e8 3805 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf
A
3806 // printf("looping %d\n", loop);
3807 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3808 if (t >= STRSIZE) {
3809 TEST_ASSERT(FALSE);
3810 continue;
3811 }
3812
46f4442e 3813
73c04bcf
A
3814 UnicodeString ustr(str);
3815 RBBILineMonkey monkey;
3816 if (U_FAILURE(monkey.deferredStatus)) {
3817 continue;
3818 }
3819
3820 const int EXPECTEDSIZE = 50;
3821 int expected[EXPECTEDSIZE];
3822 int expectedcount = 0;
3823
3824 monkey.setText(ustr);
340931cb 3825
73c04bcf
A
3826 int i;
3827 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3828 if (expectedcount >= EXPECTEDSIZE) {
3829 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3830 return;
3831 }
3832 expected[expectedcount ++] = i;
3833 }
3834
3835 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3836 }
3837 delete bi;
3838#endif
3839}
3840
3841void RBBITest::TestSentBreaks(void)
3842{
3843#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3844 Locale locale("en");
3845 UErrorCode status = U_ZERO_ERROR;
3846 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3847 UChar str[200];
3848 static const char *strlist[] =
3849 {
3850 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3851 "This\n",
3852 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3853 "\"Sentence ending with a quote.\" Bye.",
3854 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3855 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3856 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3857 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3858 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3859 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3860 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3861 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3862 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3863 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3864 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3865 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3866 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3867 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3868 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3869 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3870 };
3871 int loop;
3872 if (U_FAILURE(status)) {
729e4ab9 3873 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3874 return;
3875 }
2ca993e8
A
3876 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3877 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
73c04bcf
A
3878 UnicodeString ustr(str);
3879
3880 RBBISentMonkey monkey;
3881 if (U_FAILURE(monkey.deferredStatus)) {
3882 continue;
3883 }
3884
3885 const int EXPECTEDSIZE = 50;
3886 int expected[EXPECTEDSIZE];
3887 int expectedcount = 0;
3888
3889 monkey.setText(ustr);
340931cb 3890
73c04bcf
A
3891 int i;
3892 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3893 if (expectedcount >= EXPECTEDSIZE) {
3894 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3895 return;
3896 }
3897 expected[expectedcount ++] = i;
3898 }
3899
3900 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3901 }
3902 delete bi;
3903#endif
3904}
3905
f3c0d7a5 3906void RBBITest::TestMonkey() {
73c04bcf
A
3907#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3908
3909 UErrorCode status = U_ZERO_ERROR;
3910 int32_t loopCount = 500;
3911 int32_t seed = 1;
3912 UnicodeString breakType = "all";
3913 Locale locale("en");
3914 UBool useUText = FALSE;
3915
3916 if (quick == FALSE) {
3917 loopCount = 10000;
3918 }
3919
f3c0d7a5
A
3920 if (fTestParams) {
3921 UnicodeString p(fTestParams);
73c04bcf
A
3922 loopCount = getIntParam("loop", p, loopCount);
3923 seed = getIntParam("seed", p, seed);
3924
3925 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3926 if (m.find()) {
3927 breakType = m.group(1, status);
3928 m.reset();
3929 p = m.replaceFirst("", status);
3930 }
3931
3932 RegexMatcher u(" *utext", p, 0, status);
3933 if (u.find()) {
3934 useUText = TRUE;
3935 u.reset();
3936 p = u.replaceFirst("", status);
3937 }
3938
3939
3940 // m.reset(p);
46f4442e 3941 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
73c04bcf
A
3942 // Each option is stripped out of the option string as it is processed.
3943 // All options have been checked. The option string should have been completely emptied..
3944 char buf[100];
3945 p.extract(buf, sizeof(buf), NULL, status);
3946 buf[sizeof(buf)-1] = 0;
3947 errln("Unrecognized or extra parameter: %s\n", buf);
3948 return;
3949 }
3950
3951 }
3952
3953 if (breakType == "char" || breakType == "all") {
3954 RBBICharMonkey m;
3955 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3956 if (U_SUCCESS(status)) {
3957 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3958 if (breakType == "all" && useUText==FALSE) {
3959 // Also run a quick test with UText when "all" is specified
3960 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3961 }
3962 }
3963 else {
729e4ab9 3964 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
73c04bcf
A
3965 }
3966 delete bi;
3967 }
3968
3969 if (breakType == "word" || breakType == "all") {
3970 logln("Word Break Monkey Test");
3971 RBBIWordMonkey m;
3972 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3973 if (U_SUCCESS(status)) {
3974 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3975 }
3976 else {
729e4ab9 3977 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
73c04bcf
A
3978 }
3979 delete bi;
3980 }
3981
3982 if (breakType == "line" || breakType == "all") {
3983 logln("Line Break Monkey Test");
3984 RBBILineMonkey m;
3985 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3986 if (loopCount >= 10) {
3987 loopCount = loopCount / 5; // Line break runs slower than the others.
3988 }
3989 if (U_SUCCESS(status)) {
3990 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3991 }
3992 else {
729e4ab9 3993 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3994 }
3995 delete bi;
3996 }
3997
46f4442e 3998 if (breakType == "sent" || breakType == "all" ) {
73c04bcf
A
3999 logln("Sentence Break Monkey Test");
4000 RBBISentMonkey m;
4001 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4002 if (loopCount >= 10) {
4003 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4004 }
4005 if (U_SUCCESS(status)) {
340931cb 4006 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
73c04bcf
A
4007 }
4008 else {
729e4ab9 4009 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
4010 }
4011 delete bi;
4012 }
4013
4014#endif
4015}
4016
4017//
4018// Run a RBBI monkey test. Common routine, for all break iterator types.
4019// Parameters:
4020// bi - the break iterator to use
4021// mk - MonkeyKind, abstraction for obtaining expected results
4022// name - Name of test (char, word, etc.) for use in error messages
4023// seed - Seed for starting random number generator (parameter from user)
4024// numIterations
4025//
46f4442e 4026void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
73c04bcf
A
4027 int32_t numIterations, UBool useUText) {
4028
4029#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4030
4031 const int32_t TESTSTRINGLEN = 500;
4032 UnicodeString testText;
4033 int32_t numCharClasses;
4034 UVector *chClasses;
73c04bcf
A
4035 int expectedCount = 0;
4036 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4037 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4038 char reverseBreaks[TESTSTRINGLEN*2+1];
4039 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4040 char followingBreaks[TESTSTRINGLEN*2+1];
4041 char precedingBreaks[TESTSTRINGLEN*2+1];
4042 int i;
4043 int loopCount = 0;
4044
340931cb 4045
73c04bcf
A
4046 m_seed = seed;
4047
4048 numCharClasses = mk.charClasses()->size();
4049 chClasses = mk.charClasses();
4050
4051 // Check for errors that occured during the construction of the MonkeyKind object.
4052 // Can't report them where they occured because errln() is a method coming from intlTest,
4053 // and is not visible outside of RBBITest :-(
4054 if (U_FAILURE(mk.deferredStatus)) {
4055 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4056 return;
4057 }
4058
4059 // Verify that the character classes all have at least one member.
4060 for (i=0; i<numCharClasses; i++) {
4061 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4062 if (s == NULL || s->size() == 0) {
4063 errln("Character Class #%d is null or of zero size.", i);
4064 return;
4065 }
4066 }
4067
340931cb
A
4068 // For minimizing width of class name output.
4069 int classNameSize = mk.maxClassNameSize();
4070
73c04bcf
A
4071 while (loopCount < numIterations || numIterations == -1) {
4072 if (numIterations == -1 && loopCount % 10 == 0) {
4073 // If test is running in an infinite loop, display a periodic tic so
4074 // we can tell that it is making progress.
4075 fprintf(stderr, ".");
4076 }
4077 // Save current random number seed, so that we can recreate the random numbers
4078 // for this loop iteration in event of an error.
4079 seed = m_seed;
4080
4081 // Populate a test string with data.
4082 testText.truncate(0);
4083 for (i=0; i<TESTSTRINGLEN; i++) {
4084 int32_t aClassNum = m_rand() % numCharClasses;
4085 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4086 int32_t charIdx = m_rand() % classSet->size();
4087 UChar32 c = classSet->charAt(charIdx);
4088 if (c < 0) { // TODO: deal with sets containing strings.
2ca993e8 4089 errln("%s:%d c < 0", __FILE__, __LINE__);
73c04bcf
A
4090 break;
4091 }
2ca993e8
A
4092 // Do not assemble a supplementary character from randomly generated separate surrogates.
4093 // (It could be a dictionary character)
4094 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4095 continue;
4096 }
4097
73c04bcf
A
4098 testText.append(c);
4099 }
4100
340931cb 4101 // Calculate the expected results for this test string and reset applied rules.
73c04bcf 4102 mk.setText(testText);
340931cb 4103
73c04bcf
A
4104 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4105 expectedBreaks[0] = 1;
4106 int32_t breakPos = 0;
4107 expectedCount = 0;
4108 for (;;) {
4109 breakPos = mk.next(breakPos);
4110 if (breakPos == -1) {
4111 break;
4112 }
4113 if (breakPos > testText.length()) {
4114 errln("breakPos > testText.length()");
4115 }
4116 expectedBreaks[breakPos] = 1;
4117 U_ASSERT(expectedCount<testText.length());
73c04bcf
A
4118 }
4119
4120 // Find the break positions using forward iteration
4121 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4122 if (useUText) {
4123 UErrorCode status = U_ZERO_ERROR;
4124 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4125 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4126 bi->setText(testUText, status);
4127 TEST_ASSERT_SUCCESS(status);
4128 utext_close(testUText); // The break iterator does a shallow clone of the UText
4129 // This UText can be closed immediately, so long as the
4130 // testText string continues to exist.
4131 } else {
4132 bi->setText(testText);
4133 }
4134
4135 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4136 if (i < 0 || i > testText.length()) {
4137 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4138 break;
4139 }
4140 forwardBreaks[i] = 1;
4141 }
4142
4143 // Find the break positions using reverse iteration
4144 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4145 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4146 if (i < 0 || i > testText.length()) {
4147 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4148 break;
4149 }
4150 reverseBreaks[i] = 1;
4151 }
4152
4153 // Find the break positions using isBoundary() tests.
4154 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4155 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4156 for (i=0; i<=testText.length(); i++) {
4157 isBoundaryBreaks[i] = bi->isBoundary(i);
4158 }
4159
4160
4161 // Find the break positions using the following() function.
4162 // printf(".");
4163 memset(followingBreaks, 0, sizeof(followingBreaks));
4164 int32_t lastBreakPos = 0;
4165 followingBreaks[0] = 1;
4166 for (i=0; i<testText.length(); i++) {
4167 breakPos = bi->following(i);
4168 if (breakPos <= i ||
4169 breakPos < lastBreakPos ||
4170 breakPos > testText.length() ||
729e4ab9 4171 (breakPos > lastBreakPos && lastBreakPos > i)) {
73c04bcf
A
4172 errln("%s break monkey test: "
4173 "Out of range value returned by BreakIterator::following().\n"
4174 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4175 name, seed, i, breakPos, lastBreakPos);
4176 break;
4177 }
4178 followingBreaks[breakPos] = 1;
4179 lastBreakPos = breakPos;
4180 }
4181
4182 // Find the break positions using the preceding() function.
46f4442e 4183 memset(precedingBreaks, 0, sizeof(precedingBreaks));
73c04bcf
A
4184 lastBreakPos = testText.length();
4185 precedingBreaks[testText.length()] = 1;
4186 for (i=testText.length(); i>0; i--) {
4187 breakPos = bi->preceding(i);
4188 if (breakPos >= i ||
4189 breakPos > lastBreakPos ||
729e4ab9
A
4190 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4191 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
73c04bcf
A
4192 errln("%s break monkey test: "
4193 "Out of range value returned by BreakIterator::preceding().\n"
4194 "index=%d; prev returned %d; lastBreak=%d" ,
4195 name, i, breakPos, lastBreakPos);
46f4442e
A
4196 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4197 precedingBreaks[i] = 2; // Forces an error.
4198 }
73c04bcf 4199 } else {
46f4442e
A
4200 if (breakPos >= 0) {
4201 precedingBreaks[breakPos] = 1;
2ca993e8 4202 }
73c04bcf
A
4203 lastBreakPos = breakPos;
4204 }
4205 }
4206
4207 // Compare the expected and actual results.
4208 for (i=0; i<=testText.length(); i++) {
4209 const char *errorType = NULL;
340931cb 4210 const char* currentBreakData = NULL;
73c04bcf
A
4211 if (forwardBreaks[i] != expectedBreaks[i]) {
4212 errorType = "next()";
340931cb 4213 currentBreakData = forwardBreaks;
73c04bcf
A
4214 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4215 errorType = "previous()";
340931cb
A
4216 currentBreakData = reverseBreaks;
4217 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
73c04bcf 4218 errorType = "isBoundary()";
340931cb 4219 currentBreakData = isBoundaryBreaks;
73c04bcf
A
4220 } else if (followingBreaks[i] != expectedBreaks[i]) {
4221 errorType = "following()";
340931cb 4222 currentBreakData = followingBreaks;
73c04bcf
A
4223 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4224 errorType = "preceding()";
340931cb 4225 currentBreakData = precedingBreaks;
73c04bcf
A
4226 }
4227
73c04bcf
A
4228 if (errorType != NULL) {
4229 // Format a range of the test text that includes the failure as
4230 // a data item that can be included in the rbbi test data file.
4231
4232 // Start of the range is the last point where expected and actual results
340931cb
A
4233 // both agreed that there was a break position.
4234
73c04bcf
A
4235 int startContext = i;
4236 int32_t count = 0;
4237 for (;;) {
4238 if (startContext==0) { break; }
4239 startContext --;
4240 if (expectedBreaks[startContext] != 0) {
4241 if (count == 2) break;
4242 count ++;
4243 }
4244 }
4245
4246 // End of range is two expected breaks past the start position.
4247 int endContext = i + 1;
4248 int ci;
4249 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4250 for (;;) {
4251 if (endContext >= testText.length()) {break;}
4252 if (expectedBreaks[endContext-1] != 0) {
4253 if (count == 0) break;
4254 count --;
4255 }
4256 endContext ++;
4257 }
4258 }
4259
340931cb
A
4260 // Formatting of each line includes:
4261 // character code
4262 // reference break: '|' -> a break, '.' -> no break
4263 // actual break: '|' -> a break, '.' -> no break
4264 // (name of character clase)
4265 // Unicode name of character
4266 // '-->' indicates location of the difference.
73c04bcf 4267
340931cb
A
4268 MONKEY_ERROR(
4269 (expectedBreaks[i] ? "Break expected but not found" :
4270 "Break found but not expected"),
4271 name, i, seed);
73c04bcf 4272
340931cb 4273 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
73c04bcf 4274 UChar32 c;
73c04bcf 4275 c = testText.char32At(ci);
340931cb
A
4276
4277 std::string currentLineFlag = " ";
73c04bcf 4278 if (ci == i) {
340931cb 4279 currentLineFlag = "-->"; // Error position
73c04bcf 4280 }
340931cb
A
4281
4282 // BMP or SMP character in hex
4283 char hexCodePoint[12];
4284 std::string format = " \\u%04x";
4285 if (c >= 0x10000) {
4286 format = "\\U%08x";
4287 }
4288 sprintf(hexCodePoint, format.c_str(), c);
4289
4290 // Get the class name and character name for the character.
4291 char cName[200];
4292 UErrorCode status = U_ZERO_ERROR;
4293 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4294
4295 char buffer[200];
4296 snprintf(buffer, 200,
4297 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4298 currentLineFlag.c_str(),
4299 ci,
4300 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4301 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4302 hexCodePoint,
4303 classNameSize,
4304 mk.classNameFromCodepoint(c).c_str(),
4305 mk.getAppliedRule(ci).c_str(), cName);
4306
4307 // Output the error
4308 if (ci == i) {
4309 errln(buffer);
73c04bcf 4310 } else {
340931cb 4311 infoln(buffer);
73c04bcf 4312 }
340931cb
A
4313
4314 if (ci >= endContext) { break; }
73c04bcf 4315 }
73c04bcf
A
4316 break;
4317 }
4318 }
4319
4320 loopCount++;
4321 }
4322#endif
4323}
4324
729e4ab9
A
4325
4326// Bug 5532. UTF-8 based UText fails in dictionary code.
4327// This test checks the initial patch,
4328// which is to just keep it from crashing. Correct word boundaries
4329// await a proper fix to the dictionary code.
4330//
4331void RBBITest::TestBug5532(void) {
4332 // Text includes a mixture of Thai and Latin.
4333 const unsigned char utf8Data[] = {
4334 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
2ca993e8 4335 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
729e4ab9
A
4336 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4337 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4338 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
2ca993e8
A
4339 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4340 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4341 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4342 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4343 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
729e4ab9
A
4344 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4345
4346 UErrorCode status = U_ZERO_ERROR;
4347 UText utext=UTEXT_INITIALIZER;
4348 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4349 TEST_ASSERT_SUCCESS(status);
4350
4351 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4352 TEST_ASSERT_SUCCESS(status);
4353 if (U_SUCCESS(status)) {
4354 bi->setText(&utext, status);
4355 TEST_ASSERT_SUCCESS(status);
4356
4357 int32_t breakCount = 0;
4358 int32_t previousBreak = -1;
4359 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4360 // For now, just make sure that the break iterator doesn't hang.
4361 TEST_ASSERT(previousBreak < bi->current());
4362 previousBreak = bi->current();
4363 }
4364 TEST_ASSERT(breakCount > 0);
4365 }
4366 delete bi;
4367 utext_close(&utext);
4368}
4369
4370
51004dcb
A
4371void RBBITest::TestBug9983(void) {
4372 UnicodeString text = UnicodeString("\\u002A" // * Other
4373 "\\uFF65" // Other
4374 "\\u309C" // Katakana
4375 "\\uFF9F" // Extend
4376 "\\uFF65" // Other
4377 "\\u0020" // Other
4378 "\\u0000").unescape();
4379
4380 UErrorCode status = U_ZERO_ERROR;
4381 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4382 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4383 TEST_ASSERT_SUCCESS(status);
57a6839d
A
4384 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4385 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4386 TEST_ASSERT_SUCCESS(status);
51004dcb
A
4387 if (U_FAILURE(status)) {
4388 return;
4389 }
57a6839d
A
4390 int32_t offset, rstatus, iterationCount;
4391
51004dcb 4392 brkiter->setText(text);
51004dcb 4393 brkiter->last();
57a6839d 4394 iterationCount = 0;
51004dcb
A
4395 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4396 iterationCount++;
4397 rstatus = brkiter->getRuleStatus();
57a6839d
A
4398 (void)rstatus; // Suppress set but not used warning.
4399 if (iterationCount >= 10) {
2ca993e8 4400 break;
57a6839d
A
4401 }
4402 }
4403 TEST_ASSERT(iterationCount == 6);
4404
4405 brkiterPOSIX->setText(text);
4406 brkiterPOSIX->last();
4407 iterationCount = 0;
4408 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4409 iterationCount++;
4410 rstatus = brkiterPOSIX->getRuleStatus();
4411 (void)rstatus; // Suppress set but not used warning.
51004dcb 4412 if (iterationCount >= 10) {
2ca993e8 4413 break;
51004dcb
A
4414 }
4415 }
4416 TEST_ASSERT(iterationCount == 6);
4417}
4418
f3c0d7a5
A
4419// Bug 7547 - verify that building a break itereator from empty rules produces an error.
4420//
4421void RBBITest::TestBug7547() {
4422 UnicodeString rules;
4423 UErrorCode status = U_ZERO_ERROR;
4424 UParseError parseError;
4425 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4426 if (status != U_BRK_RULE_SYNTAX) {
4427 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4428 }
4429 if (parseError.line != 1 || parseError.offset != 0) {
4430 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4431 }
4432}
4433
4434
4435void RBBITest::TestBug12797() {
4436 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4437 UErrorCode status = U_ZERO_ERROR;
4438 UParseError parseError;
4439 RuleBasedBreakIterator bi(rules, parseError, status);
4440 if (U_FAILURE(status)) {
4441 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4442 return;
4443 }
4444 UnicodeString text = "abc";
4445 bi.setText(text);
4446 bi.first();
4447 int32_t boundary = bi.next();
4448 if (boundary != 3) {
4449 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4450 }
4451}
4452
4453void RBBITest::TestBug12918() {
4454 // This test triggers an assertion failure in dictbe.cpp
4455 const UChar *crasherString = u"\u3325\u4a16";
4456 UErrorCode status = U_ZERO_ERROR;
4457 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4458 if (U_FAILURE(status)) {
4459 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4460 return;
4461 }
4462 ubrk_first(iter);
4463 int32_t pos = 0;
4464 int32_t lastPos = -1;
4465 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4466 if (pos <= lastPos) {
4467 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4468 break;
4469 }
4470 }
4471 ubrk_close(iter);
4472}
4473
4474void RBBITest::TestBug12932() {
4475 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4476 UnicodeString ruleStr(
4477 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4478 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4479 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4480 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4481 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4482 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4483
4484 UErrorCode status = U_ZERO_ERROR;
4485 UParseError parseError;
4486 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4487 if (status != U_BRK_RULE_SYNTAX) {
4488 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4489 __FILE__, __LINE__, u_errorName(status));
4490 }
4491}
4492
4493
4494// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4495// remain undevided by ICU char, word and line break.
4496void RBBITest::TestEmoji() {
0f5d89e8 4497#if !UCONFIG_NO_REGULAR_EXPRESSIONS
f3c0d7a5
A
4498 UErrorCode status = U_ZERO_ERROR;
4499
4500 CharString testFileName;
4501 testFileName.append(IntlTest::getSourceTestData(status), status);
4502 testFileName.appendPathPart("emoji-test.txt", status);
4503 if (U_FAILURE(status)) {
4504 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4505 return;
4506 }
4507 logln("Opening data file %s\n", testFileName.data());
4508
4509 int len;
4510 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4511 if (U_FAILURE(status) || testFile == NULL) {
4512 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4513 return;
4514 }
4515 UnicodeString testFileAsString(testFile, len);
4516 delete [] testFile;
4517
4518 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4519 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4520 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4521 int32_t lineNumber = 0;
4522
4523 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4524 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4525 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4526 if (U_FAILURE(status)) {
4527 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4528 return;
4529 }
4530
4531 while (lineMatcher.find()) {
4532 ++lineNumber;
4533 UnicodeString line = lineMatcher.group(status);
4534 hexMatcher.reset(line);
4535 UnicodeString testString; // accumulates the emoji sequence.
4536 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4537 UnicodeString hex = hexMatcher.group(1, status);
4538 if (hex.length() > 8) {
4539 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4540 break;
4541 }
4542 CharString hex8;
4543 hex8.appendInvariantChars(hex, status);
4544 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4545 if (c<=0x10ffff) {
4546 testString.append(c);
4547 } else {
4548 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4549 __FILE__, __LINE__, lineNumber, hex8.data());
4550 break;
4551 }
4552 }
4553
4554 if (testString.length() > 1) {
4555 charBreaks->setText(testString);
4556 charBreaks->first();
4557 int32_t firstBreak = charBreaks->next();
4558 if (testString.length() != firstBreak) {
4559 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4560 __FILE__, __LINE__, lineNumber, firstBreak);
4561 }
4562 wordBreaks->setText(testString);
4563 wordBreaks->first();
4564 firstBreak = wordBreaks->next();
4565 if (testString.length() != firstBreak) {
4566 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4567 __FILE__, __LINE__, lineNumber, firstBreak);
4568 }
4569 lineBreaks->setText(testString);
4570 lineBreaks->first();
4571 firstBreak = lineBreaks->next();
4572 if (testString.length() != firstBreak) {
4573 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4574 __FILE__, __LINE__, lineNumber, firstBreak);
4575 }
4576 }
4577 }
0f5d89e8
A
4578#endif
4579}
4580
4581
4582// TestBug12519 - Correct handling of Locales by assignment / copy / clone
4583
0f5d89e8
A
4584void RBBITest::TestBug12519() {
4585 UErrorCode status = U_ZERO_ERROR;
4586 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4587 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4588 if (!assertSuccess(WHERE, status)) {
4589 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4590 return;
4591 }
4592 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4593
4594 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4595 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4596
340931cb 4597 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
0f5d89e8
A
4598 assertTrue(WHERE, *biEn == *cloneEn);
4599 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4600
340931cb 4601 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
0f5d89e8
A
4602 assertTrue(WHERE, *biFr == *cloneFr);
4603 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4604
4605 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4606 UnicodeString text("Hallo Welt");
4607 biDe->setText(text);
4608 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4609 *biDe = *biFr;
4610 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4611}
4612
4613void RBBITest::TestBug12677() {
4614 // Check that stripping of comments from rules for getRules() is not confused by
4615 // the presence of '#' characters in the rules that do not introduce comments.
4616 UnicodeString rules(u"!!forward; \n"
4617 "$x = [ab#]; # a set with a # literal. \n"
4618 " # .; # a comment that looks sort of like a rule. \n"
4619 " '#' '?'; # a rule with a quoted # \n"
4620 );
4621
4622 UErrorCode status = U_ZERO_ERROR;
4623 UParseError pe;
4624 RuleBasedBreakIterator bi(rules, pe, status);
4625 assertSuccess(WHERE, status);
4626 UnicodeString rtRules = bi.getRules();
4627 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
4628}
4629
4630
4631void RBBITest::TestTableRedundancies() {
4632 UErrorCode status = U_ZERO_ERROR;
4633
4634 LocalPointer<RuleBasedBreakIterator> bi (
4635 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4636 assertSuccess(WHERE, status);
4637 if (U_FAILURE(status)) return;
4638
4639 RBBIDataWrapper *dw = bi->fData;
4640 const RBBIStateTable *fwtbl = dw->fForwardTable;
4641 int32_t numCharClasses = dw->fHeader->fCatCount;
4642 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4643
4644 // Check for duplicate columns (character categories)
4645
4646 std::vector<UnicodeString> columns;
4647 for (int32_t column = 0; column < numCharClasses; column++) {
4648 UnicodeString s;
4649 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4650 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4651 s.append(row->fNextState[column]);
4652 }
4653 columns.push_back(s);
4654 }
4655 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4656 for (int c1=1; c1<numCharClasses; c1++) {
4657 for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4658 if (columns.at(c1) == columns.at(c2)) {
4659 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4660 goto out;
4661 }
4662 }
4663 }
4664 out:
4665
4666 // Check for duplicate states
4667 std::vector<UnicodeString> rows;
4668 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4669 UnicodeString s;
4670 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4671 assertTrue(WHERE, row->fAccepting >= -1);
4672 s.append(row->fAccepting + 1); // values of -1 are expected.
4673 s.append(row->fLookAhead);
4674 s.append(row->fTagIdx);
4675 for (int32_t column = 0; column < numCharClasses; column++) {
4676 s.append(row->fNextState[column]);
4677 }
4678 rows.push_back(s);
4679 }
4680 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4681 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4682 if (rows.at(r1) == rows.at(r2)) {
4683 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4684 return;
4685 }
4686 }
4687 }
4688}
4689
4690// Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4691// even after next() has returned DONE.
4692
4693void RBBITest::TestBug13447() {
4694 UErrorCode status = U_ZERO_ERROR;
4695 LocalPointer<RuleBasedBreakIterator> bi(
4696 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4697 assertSuccess(WHERE, status);
4698 if (U_FAILURE(status)) return;
4699 UnicodeString data(u"1234");
4700 bi->setText(data);
4701 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4702 assertEquals(WHERE, 4, bi->next());
4703 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4704 assertEquals(WHERE, UBRK_DONE, bi->next());
4705 assertEquals(WHERE, 4, bi->current());
4706 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4707}
4708
4709// TestReverse exercises both the synthesized safe reverse rules and the logic
4710// for filling the break iterator cache when starting from random positions
4711// in the text.
4712//
4713// It's a monkey test, working on random data, with the expected data obtained
4714// from forward iteration (no safe rules involved), comparing with results
4715// when indexing into the interior of the string (safe rules needed).
4716
4717void RBBITest::TestReverse() {
4718 UErrorCode status = U_ZERO_ERROR;
4719
4720 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4721 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4722 assertSuccess(WHERE, status, true);
4723 status = U_ZERO_ERROR;
4724 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4725 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4726 assertSuccess(WHERE, status, true);
4727 status = U_ZERO_ERROR;
4728 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4729 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4730 assertSuccess(WHERE, status, true);
4731 status = U_ZERO_ERROR;
4732 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4733 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4734 assertSuccess(WHERE, status, true);
4735}
4736
4737void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4738 if (!bi) {
4739 return;
4740 }
4741
4742 // From the mapping trie in the break iterator's internal data, create a
4743 // vector of UnicodeStrings, one for each character category, containing
4744 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4745 // to avoid an execess of unassigned code points.
4746
4747 RBBIDataWrapper *data = bi->fData;
4748 int32_t categoryCount = data->fHeader->fCatCount;
4749 UTrie2 *trie = data->fTrie;
4750
4751 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4752 for (int cp=0; cp<0x1fff0; ++cp) {
4753 int cat = utrie2_get32(trie, cp);
4754 cat &= ~0x4000; // And off the dictionary bit from the category.
4755 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4756 if (cat < 0 || cat >= categoryCount) return;
4757 strings[cat].append(cp);
4758 }
4759
4760 icu_rand randomGen;
4761 const int testStringLength = 10000;
4762 UnicodeString testString;
4763
4764 for (int i=0; i<testStringLength; ++i) {
4765 int charClass = randomGen() % categoryCount;
4766 if (strings[charClass].length() > 0) {
4767 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4768 testString.append(cp);
4769 }
4770 }
4771
4772 typedef std::pair<UBool, int32_t> Result;
4773 std::vector<Result> expectedResults;
4774 bi->setText(testString);
4775 for (int i=0; i<testString.length(); ++i) {
4776 bool isboundary = bi->isBoundary(i);
4777 int ruleStatus = bi->getRuleStatus();
4778 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4779 }
4780
4781 for (int i=testString.length()-1; i>=0; --i) {
4782 bi->setText(testString); // clears the internal break cache
4783 Result expected = expectedResults[i];
4784 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4785 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4786 }
f3c0d7a5
A
4787}
4788
51004dcb 4789
0f5d89e8
A
4790// Ticket 13692 - finding word boundaries in very large numbers or words could
4791// be very time consuming. When the problem was present, this void test
4792// would run more than fifteen minutes, which is to say, the failure was noticeale.
4793
4794void RBBITest::TestBug13692() {
4795 UErrorCode status = U_ZERO_ERROR;
4796 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4797 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4798 if (!assertSuccess(WHERE, status, true)) {
4799 return;
4800 }
4801 constexpr int32_t LENGTH = 1000000;
4802 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4803 for (int i=0; i<20; i+=2) {
4804 longNumber.setCharAt(i, u' ');
4805 }
4806 bi->setText(longNumber);
4807 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4808 assertSuccess(WHERE, status);
4809}
4810
73c04bcf
A
4811//
4812// TestDebug - A place-holder test for debugging purposes.
4813// For putting in fragments of other tests that can be invoked
4814// for tracing without a lot of unwanted extra stuff happening.
4815//
4816void RBBITest::TestDebug(void) {
0f5d89e8
A
4817 UErrorCode status = U_ZERO_ERROR;
4818 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4819 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4820 if (!assertSuccess(WHERE, status, true)) {
4821 return;
4822 }
4823 const UnicodeString &rules = bi->getRules();
4824 UParseError pe;
4825 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4826 assertSuccess(WHERE, status);
73c04bcf
A
4827}
4828
4388f060
A
4829void RBBITest::TestProperties() {
4830 UErrorCode errorCode = U_ZERO_ERROR;
4831 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4832 if (!prependSet.isEmpty()) {
4833 errln(
4834 "[:GCB=Prepend:] is not empty any more. "
4835 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4836 "change this test to the opposite condition.");
4837 }
4838}
4839
0f5d89e8 4840#endif // #if !UCONFIG_NO_BREAK_ITERATION