]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/********************************************************************
4 * COPYRIGHT:
2ca993e8 5 * Copyright (c) 1999-2016, International Business Machines Corporation and
73c04bcf
A
6 * others. All Rights Reserved.
7 ********************************************************************/
8/************************************************************************
9* Date Name Description
10* 12/15/99 Madhu Creation.
11* 01/12/2000 Madhu Updated for changed API and added new tests
12************************************************************************/
13
14#include "unicode/utypes.h"
73c04bcf
A
15#if !UCONFIG_NO_BREAK_ITERATION
16
2ca993e8
A
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
0f5d89e8
A
20#include <utility>
21#include <vector>
2ca993e8 22
73c04bcf 23#include "unicode/brkiter.h"
2ca993e8
A
24#include "unicode/localpointer.h"
25#include "unicode/numfmt.h"
73c04bcf 26#include "unicode/rbbi.h"
2ca993e8
A
27#if !UCONFIG_NO_REGULAR_EXPRESSIONS
28#include "unicode/regex.h"
29#endif
30#include "unicode/schriter.h"
73c04bcf
A
31#include "unicode/uchar.h"
32#include "unicode/utf16.h"
33#include "unicode/ucnv.h"
73c04bcf 34#include "unicode/uniset.h"
2ca993e8 35#include "unicode/uscript.h"
73c04bcf
A
36#include "unicode/ustring.h"
37#include "unicode/utext.h"
2ca993e8
A
38
39#include "charstr.h"
40#include "cmemory.h"
f3c0d7a5 41#include "cstr.h"
73c04bcf
A
42#include "intltest.h"
43#include "rbbitst.h"
0f5d89e8 44#include "rbbidata.h"
2ca993e8 45#include "utypeinfo.h" // for 'typeid' to work
73c04bcf
A
46#include "uvector.h"
47#include "uvectr32.h"
2ca993e8 48
0f5d89e8 49
2ca993e8
A
50#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
51#include "unicode/filteredbrk.h"
52#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
73c04bcf
A
53
54#define TEST_ASSERT(x) {if (!(x)) { \
55 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
56
46f4442e 57#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
729e4ab9 58 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
73c04bcf 59
46f4442e
A
60//---------------------------------------------
61// runIndexedTest
62//---------------------------------------------
63
4388f060 64
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitst.txt. In most cases it can;
//       that is much less work than writing a new test, the diagnostic output in the
//       event of failures is good, and the test data file is shared with ICU4J, so
//       eventually the test will run there as well, without additional effort.
70
46f4442e
A
71void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
72{
73 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
f3c0d7a5 74 fTestParams = params;
46f4442e 75
f3c0d7a5 76 TESTCASE_AUTO_BEGIN;
729e4ab9 77#if !UCONFIG_NO_FILE_IO
f3c0d7a5 78 TESTCASE_AUTO(TestBug4153072);
729e4ab9 79#endif
729e4ab9 80#if !UCONFIG_NO_FILE_IO
f3c0d7a5 81 TESTCASE_AUTO(TestUnicodeFiles);
729e4ab9 82#endif
f3c0d7a5
A
83 TESTCASE_AUTO(TestGetAvailableLocales);
84 TESTCASE_AUTO(TestGetDisplayName);
729e4ab9 85#if !UCONFIG_NO_FILE_IO
f3c0d7a5
A
86 TESTCASE_AUTO(TestEndBehaviour);
87 TESTCASE_AUTO(TestWordBreaks);
88 TESTCASE_AUTO(TestWordBoundary);
89 TESTCASE_AUTO(TestLineBreaks);
90 TESTCASE_AUTO(TestSentBreaks);
91 TESTCASE_AUTO(TestExtended);
729e4ab9 92#endif
4388f060 93#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
f3c0d7a5 94 TESTCASE_AUTO(TestMonkey);
4388f060 95#endif
729e4ab9 96#if !UCONFIG_NO_FILE_IO
f3c0d7a5 97 TESTCASE_AUTO(TestBug3818);
729e4ab9 98#endif
f3c0d7a5 99 TESTCASE_AUTO(TestDebug);
729e4ab9 100#if !UCONFIG_NO_FILE_IO
f3c0d7a5 101 TESTCASE_AUTO(TestBug5775);
729e4ab9 102#endif
f3c0d7a5
A
103 TESTCASE_AUTO(TestBug9983);
104 TESTCASE_AUTO(TestDictRules);
105 TESTCASE_AUTO(TestBug5532);
106 TESTCASE_AUTO(TestBug7547);
107 TESTCASE_AUTO(TestBug12797);
108 TESTCASE_AUTO(TestBug12918);
109 TESTCASE_AUTO(TestBug12932);
110 TESTCASE_AUTO(TestEmoji);
0f5d89e8
A
111 TESTCASE_AUTO(TestBug12519);
112 TESTCASE_AUTO(TestBug12677);
113 TESTCASE_AUTO(TestTableRedundancies);
114 TESTCASE_AUTO(TestBug13447);
115 TESTCASE_AUTO(TestReverse);
116 TESTCASE_AUTO(TestBug13692);
f3c0d7a5 117 TESTCASE_AUTO_END;
46f4442e
A
118}
119
120
73c04bcf
A
121//--------------------------------------------------------------------------------------
122//
123// RBBITest constructor and destructor
124//
125//--------------------------------------------------------------------------------------
126
127RBBITest::RBBITest() {
f3c0d7a5 128 fTestParams = NULL;
73c04bcf
A
129}
130
131
132RBBITest::~RBBITest() {
73c04bcf
A
133}
134
73c04bcf 135
b331163b 136static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
73c04bcf
A
137 UErrorCode status = U_ZERO_ERROR;
138 char name[100];
139 printf("code alpha extend alphanum type word sent line name\n");
b331163b
A
140 int nextExpectedIndex = 0;
141 utext_setNativeIndex(tstr, 0);
3d1f044b 142 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
b331163b
A
143 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
144 printf("------------------------------------------------ %d\n", j);
145 ++nextExpectedIndex;
73c04bcf 146 }
b331163b
A
147
148 UChar32 c = utext_next32(tstr);
73c04bcf
A
149 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
150 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
151 u_isUAlphabetic(c),
152 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
153 u_isalnum(c),
154 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
155 u_charType(c),
156 U_SHORT_PROPERTY_NAME),
157 u_getPropertyValueName(UCHAR_WORD_BREAK,
158 u_getIntPropertyValue(c,
159 UCHAR_WORD_BREAK),
160 U_SHORT_PROPERTY_NAME),
161 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
162 u_getIntPropertyValue(c,
163 UCHAR_SENTENCE_BREAK),
164 U_SHORT_PROPERTY_NAME),
165 u_getPropertyValueName(UCHAR_LINE_BREAK,
166 u_getIntPropertyValue(c,
167 UCHAR_LINE_BREAK),
168 U_SHORT_PROPERTY_NAME),
169 name);
170 }
171}
172
73c04bcf 173
b331163b
A
174static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
175 UErrorCode status = U_ZERO_ERROR;
176 UText *tstr = NULL;
177 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
178 if (U_FAILURE(status)) {
179 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
180 return;
181 }
182 printStringBreaks(tstr, expected, expectedCount);
183 utext_close(tstr);
184}
185
186
73c04bcf
A
187void RBBITest::TestBug3818() {
188 UErrorCode status = U_ZERO_ERROR;
189
190 // Four Thai words...
191 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
192 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
193 UnicodeString thaiStr(thaiWordData);
194
57a6839d 195 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
73c04bcf 196 if (U_FAILURE(status) || bi == NULL) {
729e4ab9 197 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
73c04bcf
A
198 return;
199 }
200 bi->setText(thaiStr);
201
202 int32_t startOfSecondWord = bi->following(1);
203 if (startOfSecondWord != 4) {
204 errln("Fail at file %s, line %d expected start of word at 4, got %d",
205 __FILE__, __LINE__, startOfSecondWord);
206 }
207 startOfSecondWord = bi->following(0);
208 if (startOfSecondWord != 4) {
209 errln("Fail at file %s, line %d expected start of word at 4, got %d",
210 __FILE__, __LINE__, startOfSecondWord);
211 }
212 delete bi;
213}
214
73c04bcf
A
215
216//---------------------------------------------
217//
218// other tests
219//
220//---------------------------------------------
73c04bcf
A
221
222void RBBITest::TestGetAvailableLocales()
223{
224 int32_t locCount = 0;
225 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
226
227 if (locCount == 0)
729e4ab9 228 dataerrln("getAvailableLocales() returned an empty list!");
73c04bcf
A
229 // Just make sure that it's returning good memory.
230 int32_t i;
231 for (i = 0; i < locCount; ++i) {
232 logln(locList[i].getName());
233 }
234}
235
236//Testing the BreakIterator::getDisplayName() function
237void RBBITest::TestGetDisplayName()
238{
239 UnicodeString result;
240
241 BreakIterator::getDisplayName(Locale::getUS(), result);
242 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
729e4ab9 243 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
73c04bcf
A
244 + result);
245
246 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
247 if (result != "French (France)")
729e4ab9 248 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
73c04bcf
A
249 + result);
250}
251/**
252 * Test End Behaviour
253 * @bug 4068137
254 */
255void RBBITest::TestEndBehaviour()
256{
257 UErrorCode status = U_ZERO_ERROR;
258 UnicodeString testString("boo.");
259 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
260 if (U_FAILURE(status))
261 {
729e4ab9 262 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
73c04bcf
A
263 return;
264 }
265 wb->setText(testString);
266
267 if (wb->first() != 0)
268 errln("Didn't get break at beginning of string.");
269 if (wb->next() != 3)
270 errln("Didn't get break before period in \"boo.\"");
271 if (wb->current() != 4 && wb->next() != 4)
272 errln("Didn't get break at end of string.");
273 delete wb;
274}
275/*
276 * @bug 4153072
277 */
278void RBBITest::TestBug4153072() {
279 UErrorCode status = U_ZERO_ERROR;
280 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
281 if (U_FAILURE(status))
282 {
729e4ab9 283 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
73c04bcf
A
284 return;
285 }
286 UnicodeString str("...Hello, World!...");
287 int32_t begin = 3;
288 int32_t end = str.length() - 3;
289 UBool onBoundary;
290
291 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
292 iter->adoptText(textIterator);
293 int index;
294 // Note: with the switch to UText, there is no way to restrict the
295 // iteration range to begin at an index other than zero.
296 // String character iterators created with a non-zero bound are
297 // treated by RBBI as being empty.
298 for (index = -1; index < begin + 1; ++index) {
299 onBoundary = iter->isBoundary(index);
300 if (index == 0? !onBoundary : onBoundary) {
301 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
302 " and begin index = " + begin);
303 }
304 }
305 delete iter;
306}
307
308
46f4442e
A
309//
310// Test for problem reported by Ashok Matoria on 9 July 2007
311// One.<kSoftHyphen><kSpace>Two.
312//
313// Sentence break at start (0) and then on calling next() it breaks at
314// 'T' of "Two". Now, at this point if I do next() and
315// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
316//
317void RBBITest::TestBug5775() {
318 UErrorCode status = U_ZERO_ERROR;
319 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
320 TEST_ASSERT_SUCCESS(status);
729e4ab9
A
321 if (U_FAILURE(status)) {
322 return;
323 }
324// Check for status first for better handling of no data errors.
46f4442e 325 TEST_ASSERT(bi != NULL);
729e4ab9 326 if (bi == NULL) {
46f4442e
A
327 return;
328 }
2ca993e8 329
46f4442e
A
330 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
331 // 01234 56789
332 s = s.unescape();
333 bi->setText(s);
334 int pos = bi->next();
335 TEST_ASSERT(pos == 6);
336 pos = bi->next();
337 TEST_ASSERT(pos == 10);
338 pos = bi->previous();
339 TEST_ASSERT(pos == 6);
340 delete bi;
341}
342
343
344
73c04bcf
A
345//------------------------------------------------------------------------------
346//
347// RBBITest::Extended Run RBBI Tests from an external test data file
348//
349//------------------------------------------------------------------------------
350
351struct TestParams {
b331163b
A
352 BreakIterator *bi; // Break iterator is set while parsing test source.
353 // Changed out whenever test data changes break type.
354
355 UnicodeString dataToBreak; // Data that is built up while parsing the test.
356 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
357 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
73c04bcf 358 UVector32 *srcCol;
b331163b
A
359
360 UText *textToBreak; // UText, could be UTF8 or UTF16.
361 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
362 CharString utf8String; // UTF-8 form of text to break.
363
364 TestParams(UErrorCode &status) : dataToBreak() {
365 bi = NULL;
366 expectedBreaks = new UVector32(status);
367 srcLine = new UVector32(status);
368 srcCol = new UVector32(status);
369 textToBreak = NULL;
370 textMap = new UVector32(status);
371 }
372
373 ~TestParams() {
374 delete bi;
375 delete expectedBreaks;
376 delete srcLine;
377 delete srcCol;
378 utext_close(textToBreak);
379 delete textMap;
380 }
2ca993e8 381
b331163b
A
382 int32_t getSrcLine(int32_t bp);
383 int32_t getExpectedBreak(int32_t bp);
384 int32_t getSrcCol(int32_t bp);
385
386 void setUTF16(UErrorCode &status);
387 void setUTF8(UErrorCode &status);
73c04bcf
A
388};
389
b331163b
A
390// Append a UnicodeString to a CharString with UTF-8 encoding.
391// Substitute any invalid chars.
392// Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
393static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
394 if (U_FAILURE(status)) {
395 return;
396 }
397 int32_t utf8Length;
398 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
399 src.getBuffer(), src.length(), // UTF-16 data
400 0xfffd, NULL, // Substitution char, number of subs.
401 &status);
402 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
403 return;
404 }
405 status = U_ZERO_ERROR;
406 int32_t capacity;
407 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
408 u_strToUTF8WithSub(buffer, utf8Length, NULL,
409 src.getBuffer(), src.length(),
410 0xfffd, NULL, &status);
411 dest.append(buffer, utf8Length, status);
412}
2ca993e8 413
b331163b
A
414
415void TestParams::setUTF16(UErrorCode &status) {
416 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
417 textMap->removeAllElements();
418 for (int32_t i=0; i<dataToBreak.length(); i++) {
419 if (i == dataToBreak.getChar32Start(i)) {
420 textMap->addElement(i, status);
421 } else {
422 textMap->addElement(-1, status);
423 }
424 }
425 textMap->addElement(dataToBreak.length(), status);
426 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
427}
428
429
430void TestParams::setUTF8(UErrorCode &status) {
431 if (U_FAILURE(status)) {
432 return;
433 }
434 utf8String.clear();
435 CharStringAppend(utf8String, dataToBreak, status);
436 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
437 if (U_FAILURE(status)) {
438 return;
439 }
440
441 textMap->removeAllElements();
442 int32_t utf16Index = 0;
443 for (;;) {
444 textMap->addElement(utf16Index, status);
445 UChar32 c32 = utext_current32(textToBreak);
446 if (c32 < 0) {
447 break;
448 }
449 utf16Index += U16_LENGTH(c32);
450 utext_next32(textToBreak);
451 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
452 textMap->addElement(-1, status);
453 }
454 }
455 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
456}
457
458
f3c0d7a5 459int32_t TestParams::getSrcLine(int32_t bp) {
b331163b
A
460 if (bp >= textMap->size()) {
461 bp = textMap->size() - 1;
462 }
463 int32_t i = 0;
464 for(; bp >= 0 ; --bp) {
465 // Move to a character boundary if we are not on one already.
466 i = textMap->elementAti(bp);
467 if (i >= 0) {
468 break;
469 }
470 }
471 return srcLine->elementAti(i);
472}
473
474
f3c0d7a5 475int32_t TestParams::getExpectedBreak(int32_t bp) {
b331163b
A
476 if (bp >= textMap->size()) {
477 return 0;
478 }
479 int32_t i = textMap->elementAti(bp);
480 int32_t retVal = 0;
481 if (i >= 0) {
482 retVal = expectedBreaks->elementAti(i);
483 }
484 return retVal;
485}
486
487
f3c0d7a5 488int32_t TestParams::getSrcCol(int32_t bp) {
b331163b
A
489 if (bp >= textMap->size()) {
490 bp = textMap->size() - 1;
491 }
492 int32_t i = 0;
493 for(; bp >= 0; --bp) {
494 // Move bp to a character boundary if we are not on one already.
495 i = textMap->elementAti(bp);
496 if (i >= 0) {
497 break;
498 }
499 }
500 return srcCol->elementAti(i);
501}
502
503
504void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
73c04bcf
A
505 int32_t bp;
506 int32_t prevBP;
507 int32_t i;
508
b331163b
A
509 TEST_ASSERT_SUCCESS(status);
510 if (U_FAILURE(status)) {
511 return;
512 }
513
73c04bcf
A
514 if (t->bi == NULL) {
515 return;
516 }
517
b331163b 518 t->bi->setText(t->textToBreak, status);
73c04bcf
A
519 //
520 // Run the iterator forward
521 //
522 prevBP = -1;
523 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
524 if (prevBP == bp) {
525 // Fail for lack of forward progress.
526 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
b331163b 527 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
528 break;
529 }
530
b331163b 531 // Check that there we didn't miss an expected break between the last one
73c04bcf
A
532 // and this one.
533 for (i=prevBP+1; i<bp; i++) {
b331163b 534 if (t->getExpectedBreak(i) != 0) {
73c04bcf
A
535 int expected[] = {0, i};
536 printStringBreaks(t->dataToBreak, expected, 2);
537 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 538 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
539 }
540 }
541
542 // Check that the break we did find was expected
b331163b 543 if (t->getExpectedBreak(bp) == 0) {
73c04bcf 544 int expected[] = {0, bp};
b331163b 545 printStringBreaks(t->textToBreak, expected, 2);
73c04bcf 546 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
b331163b 547 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
548 } else {
549 // The break was expected.
550 // Check that the {nnn} tag value is correct.
b331163b 551 int32_t expectedTagVal = t->getExpectedBreak(bp);
73c04bcf
A
552 if (expectedTagVal == -1) {
553 expectedTagVal = 0;
554 }
b331163b 555 int32_t line = t->getSrcLine(bp);
f3c0d7a5 556 int32_t rs = t->bi->getRuleStatus();
73c04bcf
A
557 if (rs != expectedTagVal) {
558 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
559 " Actual, Expected status = %4d, %4d",
b331163b 560 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
73c04bcf
A
561 }
562 }
563
73c04bcf
A
564 prevBP = bp;
565 }
566
567 // Verify that there were no missed expected breaks after the last one found
b331163b
A
568 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
569 if (t->getExpectedBreak(i) != 0) {
73c04bcf 570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 571 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
572 }
573 }
574
575 //
576 // Run the iterator backwards, verify that the same breaks are found.
577 //
3d1f044b 578 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
0f5d89e8
A
579 bp = t->bi->last();
580 while (bp != BreakIterator::DONE) {
73c04bcf
A
581 if (prevBP == bp) {
582 // Fail for lack of progress.
583 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
b331163b 584 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
585 break;
586 }
587
b331163b 588 // Check that we didn't miss an expected break between the last one
73c04bcf
A
589 // and this one. (UVector returns zeros for index out of bounds.)
590 for (i=prevBP-1; i>bp; i--) {
b331163b
A
591 if (t->getExpectedBreak(i) != 0) {
592 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
593 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
594 }
595 }
596
597 // Check that the break we did find was expected
b331163b 598 if (t->getExpectedBreak(bp) == 0) {
73c04bcf 599 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
b331163b 600 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
601 } else {
602 // The break was expected.
603 // Check that the {nnn} tag value is correct.
b331163b 604 int32_t expectedTagVal = t->getExpectedBreak(bp);
73c04bcf
A
605 if (expectedTagVal == -1) {
606 expectedTagVal = 0;
607 }
b331163b
A
608 int line = t->getSrcLine(bp);
609 int32_t rs = t->bi->getRuleStatus();
73c04bcf
A
610 if (rs != expectedTagVal) {
611 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
612 " Actual, Expected status = %4d, %4d",
b331163b 613 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
73c04bcf
A
614 }
615 }
616
617 prevBP = bp;
0f5d89e8 618 bp = t->bi->previous();
73c04bcf
A
619 }
620
621 // Verify that there were no missed breaks prior to the last one found
622 for (i=prevBP-1; i>=0; i--) {
b331163b 623 if (t->getExpectedBreak(i) != 0) {
73c04bcf 624 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 625 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
626 }
627 }
51004dcb
A
628
629 // Check isBoundary()
b331163b
A
630 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
631 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
51004dcb
A
632 UBool boundaryFound = t->bi->isBoundary(i);
633 if (boundaryExpected != boundaryFound) {
634 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
635 " Expected, Actual= %s, %s",
b331163b 636 i, t->getSrcLine(i), t->getSrcCol(i),
51004dcb
A
637 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
638 }
639 }
640
641 // Check following()
3d1f044b 642 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
51004dcb
A
643 int32_t actualBreak = t->bi->following(i);
644 int32_t expectedBreak = BreakIterator::DONE;
3d1f044b 645 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
b331163b 646 if (t->getExpectedBreak(j) != 0) {
51004dcb
A
647 expectedBreak = j;
648 break;
649 }
650 }
651 if (expectedBreak != actualBreak) {
652 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
653 " Expected, Actual= %d, %d",
b331163b 654 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
51004dcb
A
655 }
656 }
657
658 // Check preceding()
3d1f044b 659 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
51004dcb
A
660 int32_t actualBreak = t->bi->preceding(i);
661 int32_t expectedBreak = BreakIterator::DONE;
662
b331163b
A
663 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
664 // preceding(trailing byte) will return the index of some preceding code point,
665 // not the lead byte of the current code point, even though that has a smaller index.
666 // Therefore, start looking at the expected break data not at i-1, but at
667 // the start of code point index - 1.
668 utext_setNativeIndex(t->textToBreak, i);
3d1f044b 669 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
b331163b
A
670 for (; j >= 0; j--) {
671 if (t->getExpectedBreak(j) != 0) {
51004dcb
A
672 expectedBreak = j;
673 break;
674 }
675 }
676 if (expectedBreak != actualBreak) {
677 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
678 " Expected, Actual= %d, %d",
b331163b 679 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
51004dcb
A
680 }
681 }
73c04bcf
A
682}
683
684
685void RBBITest::TestExtended() {
f3c0d7a5
A
686 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
687 // data driven test closely entangles filtered and regular data.
688#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
73c04bcf
A
689 UErrorCode status = U_ZERO_ERROR;
690 Locale locale("");
691
b331163b 692 TestParams tp(status);
73c04bcf 693
0f5d89e8 694 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
729e4ab9
A
695 if (U_FAILURE(status)) {
696 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
697 }
73c04bcf 698
73c04bcf
A
699 //
700 // Open and read the test data file.
701 //
702 const char *testDataDirectory = IntlTest::getSourceTestData(status);
0f5d89e8
A
703 CharString testFileName(testDataDirectory, -1, status);
704 testFileName.append("rbbitst.txt", -1, status);
73c04bcf
A
705
706 int len;
0f5d89e8 707 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
73c04bcf 708 if (U_FAILURE(status)) {
0f5d89e8
A
709 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
710 return;
73c04bcf
A
711 }
712
2ca993e8 713 bool skipTest = false; // Skip this test?
46f4442e 714
73c04bcf
A
715 //
716 // Put the test data into a UnicodeString
717 //
718 UnicodeString testString(FALSE, testFile, len);
719
720 enum EParseState{
721 PARSE_COMMENT,
722 PARSE_TAG,
723 PARSE_DATA,
0f5d89e8
A
724 PARSE_NUM,
725 PARSE_RULES
73c04bcf
A
726 }
727 parseState = PARSE_TAG;
728
729 EParseState savedState = PARSE_TAG;
730
73c04bcf
A
731 int32_t lineNum = 1;
732 int32_t colStart = 0;
733 int32_t column = 0;
734 int32_t charIdx = 0;
735
0f5d89e8
A
736 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
737
738 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
739 int32_t rulesFirstLine; // Line number of the start of current <rules> block
73c04bcf
A
740
741 for (charIdx = 0; charIdx < len; ) {
742 status = U_ZERO_ERROR;
743 UChar c = testString.charAt(charIdx);
744 charIdx++;
f3c0d7a5 745 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
73c04bcf 746 // treat CRLF as a unit
f3c0d7a5 747 c = u'\n';
73c04bcf
A
748 charIdx++;
749 }
f3c0d7a5 750 if (c == u'\n' || c == u'\r') {
73c04bcf
A
751 lineNum++;
752 colStart = charIdx;
753 }
754 column = charIdx - colStart + 1;
755
756 switch (parseState) {
757 case PARSE_COMMENT:
f3c0d7a5 758 if (c == u'\n' || c == u'\r') {
73c04bcf
A
759 parseState = savedState;
760 }
761 break;
762
763 case PARSE_TAG:
764 {
f3c0d7a5 765 if (c == u'#') {
73c04bcf
A
766 parseState = PARSE_COMMENT;
767 savedState = PARSE_TAG;
768 break;
769 }
770 if (u_isUWhiteSpace(c)) {
771 break;
772 }
0f5d89e8 773 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
73c04bcf
A
774 delete tp.bi;
775 tp.bi = BreakIterator::createWordInstance(locale, status);
2ca993e8 776 skipTest = false;
73c04bcf
A
777 charIdx += 5;
778 break;
779 }
0f5d89e8 780 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
73c04bcf
A
781 delete tp.bi;
782 tp.bi = BreakIterator::createCharacterInstance(locale, status);
2ca993e8 783 skipTest = false;
73c04bcf
A
784 charIdx += 5;
785 break;
786 }
0f5d89e8 787 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
73c04bcf
A
788 delete tp.bi;
789 tp.bi = BreakIterator::createLineInstance(locale, status);
2ca993e8 790 skipTest = false;
73c04bcf
A
791 charIdx += 5;
792 break;
793 }
0f5d89e8 794 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
73c04bcf 795 delete tp.bi;
46f4442e 796 tp.bi = BreakIterator::createSentenceInstance(locale, status);
2ca993e8 797 skipTest = false;
73c04bcf
A
798 charIdx += 5;
799 break;
800 }
0f5d89e8 801 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
73c04bcf
A
802 delete tp.bi;
803 tp.bi = BreakIterator::createTitleInstance(locale, status);
804 charIdx += 6;
805 break;
806 }
46f4442e 807
0f5d89e8
A
808 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
809 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
810 charIdx = testString.indexOf(u'>', charIdx) + 1;
811 parseState = PARSE_RULES;
812 rules.remove();
813 rulesFirstLine = lineNum;
814 break;
815 }
816
73c04bcf
A
817 // <locale loc_name>
818 localeMatcher.reset(testString);
819 if (localeMatcher.lookingAt(charIdx-1, status)) {
820 UnicodeString localeName = localeMatcher.group(1, status);
821 char localeName8[100];
822 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
823 locale = Locale::createFromName(localeName8);
51004dcb 824 charIdx += localeMatcher.group(0, status).length() - 1;
73c04bcf
A
825 TEST_ASSERT_SUCCESS(status);
826 break;
827 }
0f5d89e8 828 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
73c04bcf
A
829 parseState = PARSE_DATA;
830 charIdx += 5;
831 tp.dataToBreak = "";
832 tp.expectedBreaks->removeAllElements();
833 tp.srcCol ->removeAllElements();
834 tp.srcLine->removeAllElements();
835 break;
836 }
837
838 errln("line %d: Tag expected in test file.", lineNum);
73c04bcf
A
839 parseState = PARSE_COMMENT;
840 savedState = PARSE_DATA;
46f4442e 841 goto end_test; // Stop the test.
73c04bcf
A
842 }
843 break;
844
0f5d89e8
A
845 case PARSE_RULES:
846 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
847 charIdx += 7;
848 parseState = PARSE_TAG;
849 delete tp.bi;
850 UParseError pe;
851 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
852 skipTest = U_FAILURE(status);
853 if (U_FAILURE(status)) {
854 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
855 rulesFirstLine + pe.line - 1, u_errorName(status));
856 }
857 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
858 charIdx += 10;
859 parseState = PARSE_TAG;
860 UErrorCode ec = U_ZERO_ERROR;
861 UParseError pe;
862 RuleBasedBreakIterator bi(rules, pe, ec);
863 if (U_SUCCESS(ec)) {
864 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
865 rulesFirstLine + pe.line - 1);
866 }
867 } else {
868 rules.append(c);
869 }
870 break;
871
73c04bcf 872 case PARSE_DATA:
f3c0d7a5 873 if (c == u'\u2022') { // u'•'
73c04bcf
A
874 int32_t breakIdx = tp.dataToBreak.length();
875 tp.expectedBreaks->setSize(breakIdx+1);
876 tp.expectedBreaks->setElementAt(-1, breakIdx);
877 tp.srcLine->setSize(breakIdx+1);
878 tp.srcLine->setElementAt(lineNum, breakIdx);
879 tp.srcCol ->setSize(breakIdx+1);
880 tp.srcCol ->setElementAt(column, breakIdx);
881 break;
882 }
883
0f5d89e8 884 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
73c04bcf
A
885 // Add final entry to mappings from break location to source file position.
886 // Need one extra because last break position returned is after the
887 // last char in the data, not at the last char.
888 tp.srcLine->addElement(lineNum, status);
889 tp.srcCol ->addElement(column, status);
890
891 parseState = PARSE_TAG;
892 charIdx += 6;
893
2ca993e8
A
894 if (!skipTest) {
895 // RUN THE TEST!
896 status = U_ZERO_ERROR;
897 tp.setUTF16(status);
898 executeTest(&tp, status);
899 TEST_ASSERT_SUCCESS(status);
900
901 // Run again, this time with UTF-8 text wrapped in a UText.
902 status = U_ZERO_ERROR;
903 tp.setUTF8(status);
904 TEST_ASSERT_SUCCESS(status);
905 executeTest(&tp, status);
906 }
73c04bcf
A
907 break;
908 }
909
0f5d89e8 910 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
73c04bcf
A
911 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
912 // Get the code point from the name and insert it into the test data.
913 // (Damn, no API takes names in Unicode !!!
914 // we've got to take it back to char *)
f3c0d7a5 915 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
73c04bcf
A
916 int32_t nameLength = nameEndIdx - (charIdx+2);
917 char charNameBuf[200];
918 UChar32 theChar = -1;
919 if (nameEndIdx != -1) {
920 UErrorCode status = U_ZERO_ERROR;
921 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
922 charNameBuf[sizeof(charNameBuf)-1] = 0;
923 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
924 if (U_FAILURE(status)) {
925 theChar = -1;
926 }
927 }
928 if (theChar == -1) {
929 errln("Error in named character in test file at line %d, col %d",
930 lineNum, column);
931 } else {
932 // Named code point was recognized. Insert it
933 // into the test data.
934 tp.dataToBreak.append(theChar);
935 while (tp.dataToBreak.length() > tp.srcLine->size()) {
936 tp.srcLine->addElement(lineNum, status);
937 tp.srcCol ->addElement(column, status);
938 }
939 }
940 if (nameEndIdx > charIdx) {
941 charIdx = nameEndIdx+1;
942
943 }
944 break;
945 }
946
947
948
0f5d89e8 949 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
73c04bcf
A
950 charIdx++;
951 int32_t breakIdx = tp.dataToBreak.length();
952 tp.expectedBreaks->setSize(breakIdx+1);
953 tp.expectedBreaks->setElementAt(-1, breakIdx);
954 tp.srcLine->setSize(breakIdx+1);
955 tp.srcLine->setElementAt(lineNum, breakIdx);
956 tp.srcCol ->setSize(breakIdx+1);
957 tp.srcCol ->setElementAt(column, breakIdx);
958 break;
959 }
960
f3c0d7a5 961 if (c == u'<') {
73c04bcf
A
962 tagValue = 0;
963 parseState = PARSE_NUM;
964 break;
965 }
966
f3c0d7a5 967 if (c == u'#' && column==3) { // TODO: why is column off so far?
73c04bcf
A
968 parseState = PARSE_COMMENT;
969 savedState = PARSE_DATA;
970 break;
971 }
972
f3c0d7a5 973 if (c == u'\\') {
73c04bcf
A
974 // Check for \ at end of line, a line continuation.
975 // Advance over (discard) the newline
976 UChar32 cp = testString.char32At(charIdx);
f3c0d7a5 977 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
73c04bcf
A
978 // We have a CR LF
979 // Need an extra increment of the input ptr to move over both of them
980 charIdx++;
981 }
f3c0d7a5 982 if (cp == u'\n' || cp == u'\r') {
73c04bcf
A
983 lineNum++;
984 colStart = charIdx;
985 charIdx++;
986 break;
987 }
988
989 // Let unescape handle the back slash.
990 cp = testString.unescapeAt(charIdx);
991 if (cp != -1) {
992 // Escape sequence was recognized. Insert the char
993 // into the test data.
994 tp.dataToBreak.append(cp);
995 while (tp.dataToBreak.length() > tp.srcLine->size()) {
996 tp.srcLine->addElement(lineNum, status);
997 tp.srcCol ->addElement(column, status);
998 }
999 break;
1000 }
1001
1002
1003 // Not a recognized backslash escape sequence.
1004 // Take the next char as a literal.
1005 // TODO: Should this be an error?
1006 c = testString.charAt(charIdx);
1007 charIdx = testString.moveIndex32(charIdx, 1);
1008 }
1009
1010 // Normal, non-escaped data char.
1011 tp.dataToBreak.append(c);
1012
1013 // Save the mapping from offset in the data to line/column numbers in
1014 // the original input file. Will be used for better error messages only.
1015 // If there's an expected break before this char, the slot in the mapping
1016 // vector will already be set for this char; don't overwrite it.
1017 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1018 tp.srcLine->addElement(lineNum, status);
1019 tp.srcCol ->addElement(column, status);
1020 }
1021 break;
1022
1023
1024 case PARSE_NUM:
1025 // We are parsing an expected numeric tag value, like <1234>,
1026 // within a chunk of data.
1027 if (u_isUWhiteSpace(c)) {
1028 break;
1029 }
1030
f3c0d7a5 1031 if (c == u'>') {
73c04bcf
A
1032 // Finished the number. Add the info to the expected break data,
1033 // and switch parse state back to doing plain data.
1034 parseState = PARSE_DATA;
1035 if (tagValue == 0) {
1036 tagValue = -1;
1037 }
1038 int32_t breakIdx = tp.dataToBreak.length();
1039 tp.expectedBreaks->setSize(breakIdx+1);
1040 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1041 tp.srcLine->setSize(breakIdx+1);
1042 tp.srcLine->setElementAt(lineNum, breakIdx);
1043 tp.srcCol ->setSize(breakIdx+1);
1044 tp.srcCol ->setElementAt(column, breakIdx);
1045 break;
1046 }
1047
1048 if (u_isdigit(c)) {
1049 tagValue = tagValue*10 + u_charDigitValue(c);
1050 break;
1051 }
1052
1053 errln("Syntax Error in test file at line %d, col %d",
1054 lineNum, column);
73c04bcf 1055 parseState = PARSE_COMMENT;
46f4442e 1056 goto end_test; // Stop the test
73c04bcf
A
1057 break;
1058 }
1059
1060
1061 if (U_FAILURE(status)) {
4388f060 1062 dataerrln("ICU Error %s while parsing test file at line %d.",
73c04bcf 1063 u_errorName(status), lineNum);
73c04bcf 1064 status = U_ZERO_ERROR;
46f4442e 1065 goto end_test; // Stop the test
73c04bcf
A
1066 }
1067
1068 }
1069
0f5d89e8
A
1070 // Reached end of test file. Raise an error if parseState indicates that we are
1071 // within a block that should have been terminated.
1072
1073 if (parseState == PARSE_RULES) {
1074 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075 lineNum, rulesFirstLine);
1076 }
1077 if (parseState == PARSE_DATA) {
1078 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1079 }
1080
1081
73c04bcf 1082end_test:
73c04bcf
A
1083 delete [] testFile;
1084#endif
1085}
1086
729e4ab9
A
1087
//-------------------------------------------------------------------------------
//
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.  Regression test for bug #7130.  The source
//                  rules do not declare a break iterator type (word, line,
//                  sentence, etc.), and the dictionary code, without a type,
//                  would loop.
//
//-------------------------------------------------------------------------------
1096void RBBITest::TestDictRules() {
1097 const char *rules = "$dictionary = [a-z]; \n"
1098 "!!forward; \n"
1099 "$dictionary $dictionary; \n"
1100 "!!reverse; \n"
1101 "$dictionary $dictionary; \n";
1102 const char *text = "aa";
1103 UErrorCode status = U_ZERO_ERROR;
1104 UParseError parseError;
1105
1106 RuleBasedBreakIterator bi(rules, parseError, status);
1107 if (U_SUCCESS(status)) {
1108 UnicodeString utext = text;
1109 bi.setText(utext);
1110 int32_t position;
1111 int32_t loops;
1112 for (loops = 0; loops<10; loops++) {
1113 position = bi.next();
1114 if (position == RuleBasedBreakIterator::DONE) {
1115 break;
1116 }
1117 }
1118 TEST_ASSERT(loops == 1);
1119 } else {
1120 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1121 }
1122}
1123
1124
73c04bcf
A
1125
1126//-------------------------------------------------------------------------------
1127//
1128// ReadAndConvertFile Read a text data file, convert it to UChars, and
2ca993e8 1129// return the data in one big UChar * buffer, which the caller must delete.
73c04bcf 1130//
46f4442e
A
1131// parameters:
1132// fileName: the name of the file, with no directory part. The test data directory
1133// is assumed.
1134// ulen an out parameter, receives the actual length (in UChars) of the file data.
1135// encoding The file encoding. If the file contains a BOM, that will override the encoding
1136// specified here. The BOM, if it exists, will be stripped from the returned data.
1137// Pass NULL for the system default encoding.
1138// status
1139// returns:
1140// The file data, converted to UChar.
1141// The caller must delete this when done with
1142// delete [] theBuffer;
1143//
73c04bcf
A
1144// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1145// Move this function to some common place.
1146//
1147//--------------------------------------------------------------------------------
46f4442e 1148UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
73c04bcf
A
1149 UChar *retPtr = NULL;
1150 char *fileBuf = NULL;
1151 UConverter* conv = NULL;
1152 FILE *f = NULL;
1153
1154 ulen = 0;
1155 if (U_FAILURE(status)) {
1156 return retPtr;
1157 }
1158
1159 //
1160 // Open the file.
1161 //
1162 f = fopen(fileName, "rb");
1163 if (f == 0) {
729e4ab9 1164 dataerrln("Error opening test data file %s\n", fileName);
73c04bcf
A
1165 status = U_FILE_ACCESS_ERROR;
1166 return NULL;
1167 }
1168 //
1169 // Read it in
1170 //
1171 int fileSize;
1172 int amt_read;
1173
1174 fseek( f, 0, SEEK_END);
1175 fileSize = ftell(f);
1176 fileBuf = new char[fileSize];
1177 fseek(f, 0, SEEK_SET);
3d1f044b 1178 amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
73c04bcf
A
1179 if (amt_read != fileSize || fileSize <= 0) {
1180 errln("Error reading test data file.");
1181 goto cleanUpAndReturn;
1182 }
1183
1184 //
1185 // Look for a Unicode Signature (BOM) on the data just read
1186 //
1187 int32_t signatureLength;
1188 const char * fileBufC;
46f4442e 1189 const char* bomEncoding;
73c04bcf
A
1190
1191 fileBufC = fileBuf;
46f4442e 1192 bomEncoding = ucnv_detectUnicodeSignature(
73c04bcf 1193 fileBuf, fileSize, &signatureLength, &status);
46f4442e 1194 if(bomEncoding!=NULL ){
73c04bcf
A
1195 fileBufC += signatureLength;
1196 fileSize -= signatureLength;
46f4442e 1197 encoding = bomEncoding;
73c04bcf
A
1198 }
1199
1200 //
1201 // Open a converter to take the rule file to UTF-16
1202 //
1203 conv = ucnv_open(encoding, &status);
1204 if (U_FAILURE(status)) {
1205 goto cleanUpAndReturn;
1206 }
1207
1208 //
1209 // Convert the rules to UChar.
1210 // Preflight first to determine required buffer size.
1211 //
1212 ulen = ucnv_toUChars(conv,
1213 NULL, // dest,
1214 0, // destCapacity,
1215 fileBufC,
1216 fileSize,
1217 &status);
1218 if (status == U_BUFFER_OVERFLOW_ERROR) {
1219 // Buffer Overflow is expected from the preflight operation.
1220 status = U_ZERO_ERROR;
1221
1222 retPtr = new UChar[ulen+1];
1223 ucnv_toUChars(conv,
1224 retPtr, // dest,
1225 ulen+1,
1226 fileBufC,
1227 fileSize,
1228 &status);
1229 }
1230
1231cleanUpAndReturn:
1232 fclose(f);
1233 delete []fileBuf;
1234 ucnv_close(conv);
1235 if (U_FAILURE(status)) {
1236 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 1237 delete []retPtr;
73c04bcf
A
1238 retPtr = 0;
1239 ulen = 0;
1240 };
1241 return retPtr;
1242}
1243
1244
73c04bcf 1245
46f4442e 1246//--------------------------------------------------------------------------------------------
73c04bcf 1247//
46f4442e 1248// Run tests from each of the boundary test data files distributed by the Unicode Consortium
73c04bcf 1249//
46f4442e
A
1250//-------------------------------------------------------------------------------------------
1251void RBBITest::TestUnicodeFiles() {
1252 RuleBasedBreakIterator *bi;
1253 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1254
729e4ab9 1255 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
46f4442e
A
1256 TEST_ASSERT_SUCCESS(status);
1257 if (U_SUCCESS(status)) {
1258 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1259 }
1260 delete bi;
73c04bcf 1261
729e4ab9 1262 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
46f4442e
A
1263 TEST_ASSERT_SUCCESS(status);
1264 if (U_SUCCESS(status)) {
1265 runUnicodeTestData("WordBreakTest.txt", bi);
1266 }
1267 delete bi;
73c04bcf 1268
729e4ab9 1269 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
46f4442e
A
1270 TEST_ASSERT_SUCCESS(status);
1271 if (U_SUCCESS(status)) {
1272 runUnicodeTestData("SentenceBreakTest.txt", bi);
1273 }
1274 delete bi;
73c04bcf 1275
729e4ab9 1276 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
46f4442e
A
1277 TEST_ASSERT_SUCCESS(status);
1278 if (U_SUCCESS(status)) {
1279 runUnicodeTestData("LineBreakTest.txt", bi);
73c04bcf 1280 }
46f4442e 1281 delete bi;
73c04bcf
A
1282}
1283
1284
b331163b 1285// Check for test cases from the Unicode test data files that are known to fail
3d1f044b
A
1286// and should be skipped as known issues because ICU does not fully implement
1287// the Unicode specifications, or because ICU includes tailorings that differ from
1288// the Unicode standard.
1289//
1290// Test cases are identified by the test data sequence, which tends to be more stable
1291// across Unicode versions than the test file line numbers.
1292//
1293// The test case with ticket "10666" is a dummy, included as an example.
b331163b
A
1294
1295UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
f3c0d7a5 1296 static struct TestCase {
3d1f044b 1297 const char *fTicketNum;
f3c0d7a5
A
1298 const char *fFileName;
1299 const UChar *fString;
3d1f044b
A
1300 } badTestCases[] = {
1301 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1302 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1303 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1304 // ICU is out of sync with Unicode.
1305 {"8151", "LineBreakTest.txt", u"-#"},
1306 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1307 {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
1308 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1309 {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
1310 {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1311 {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
1312 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1313
1314 // Issue ICU-12017 Improve line break around numbers
1315 {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1316 {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1317 {"12017", "LineBreakTest.txt", u"find .com"},
1318 {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1319 {"12017", "LineBreakTest.txt", u"a.2 "},
1320 {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1321 {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1322 {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1323 {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1324 {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1325 {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1326 {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1327 {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1328 {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1329 {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
b331163b 1330 };
b331163b 1331
f3c0d7a5
A
1332 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1333 const TestCase &badCase = badTestCases[n];
1334 if (!strcmp(fileName, badCase.fFileName) &&
1335 testCase == UnicodeString(badCase.fString)) {
3d1f044b 1336 return logKnownIssue(badCase.fTicketNum);
b331163b
A
1337 }
1338 }
1339 return FALSE;
1340}
1341
1342
46f4442e
A
1343//--------------------------------------------------------------------------------------------
1344//
1345// Run tests from one of the boundary test data files distributed by the Unicode Consortium
1346//
1347//-------------------------------------------------------------------------------------------
1348void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1349#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1350 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1351
46f4442e
A
1352 //
1353 // Open and read the test data file, put it into a UnicodeString.
1354 //
1355 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1356 char testFileName[1000];
1357 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
729e4ab9 1358 dataerrln("Can't open test data. Path too long.");
73c04bcf
A
1359 return;
1360 }
46f4442e
A
1361 strcpy(testFileName, testDataDirectory);
1362 strcat(testFileName, fileName);
2ca993e8 1363
46f4442e 1364 logln("Opening data file %s\n", fileName);
73c04bcf 1365
46f4442e
A
1366 int len;
1367 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1368 if (status != U_FILE_ACCESS_ERROR) {
1369 TEST_ASSERT_SUCCESS(status);
1370 TEST_ASSERT(testFile != NULL);
1371 }
1372 if (U_FAILURE(status) || testFile == NULL) {
1373 return; /* something went wrong, error already output */
1374 }
1375 UnicodeString testFileAsString(TRUE, testFile, len);
73c04bcf 1376
46f4442e
A
1377 //
1378 // Parse the test data file using a regular expression.
1379 // Each kind of token is recognized in its own capture group; what type of item was scanned
1380 // is identified by which group had a match.
1381 //
1382 // Caputure Group # 1 2 3 4 5
1383 // Parses this item: divide x hex digits comment \n unrecognized \n
1384 //
1385 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1386 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1387 UnicodeString testString;
1388 UVector32 breakPositions(status);
1389 int lineNumber = 1;
1390 TEST_ASSERT_SUCCESS(status);
1391 if (U_FAILURE(status)) {
73c04bcf
A
1392 return;
1393 }
1394
46f4442e
A
1395 //
1396 // Scan through each test case, building up the string to be broken in testString,
1397 // and the positions that should be boundaries in the breakPositions vector.
1398 //
729e4ab9 1399 int spin = 0;
46f4442e 1400 while (tokenMatcher.find()) {
729e4ab9
A
1401 if(tokenMatcher.hitEnd()) {
1402 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1403 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1404 and caused an infinite loop here on EBCDIC systems!
1405 */
1406 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1407 // return;
1408 }
46f4442e
A
1409 if (tokenMatcher.start(1, status) >= 0) {
1410 // Scanned a divide sign, indicating a break position in the test data.
1411 if (testString.length()>0) {
1412 breakPositions.addElement(testString.length(), status);
73c04bcf 1413 }
46f4442e
A
1414 }
1415 else if (tokenMatcher.start(2, status) >= 0) {
1416 // Scanned an 'x', meaning no break at this position in the test data
1417 // Nothing to be done here.
1418 }
1419 else if (tokenMatcher.start(3, status) >= 0) {
1420 // Scanned Hex digits. Convert them to binary, append to the character data string.
1421 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1422 int length = hexNumber.length();
1423 if (length<=8) {
1424 char buf[10];
1425 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1426 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1427 if (c<=0x10ffff) {
1428 testString.append(c);
1429 } else {
1430 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1431 fileName, lineNumber);
1432 }
1433 } else {
1434 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1435 fileName, lineNumber);
1436 }
1437 }
1438 else if (tokenMatcher.start(4, status) >= 0) {
1439 // Scanned to end of a line, possibly skipping over a comment in the process.
1440 // If the line from the file contained test data, run the test now.
2ca993e8 1441 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
46f4442e 1442 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
73c04bcf
A
1443 }
1444
46f4442e
A
1445 // Clear out this test case.
1446 // The string and breakPositions vector will be refilled as the next
1447 // test case is parsed.
1448 testString.remove();
1449 breakPositions.removeAllElements();
1450 lineNumber++;
1451 } else {
1452 // Scanner catchall. Something unrecognized appeared on the line.
1453 char token[16];
1454 UnicodeString uToken = tokenMatcher.group(0, status);
1455 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1456 token[sizeof(token)-1] = 0;
1457 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1458
1459 // Clean up, in preparation for continuing with the next line.
1460 testString.remove();
1461 breakPositions.removeAllElements();
1462 lineNumber++;
1463 }
1464 TEST_ASSERT_SUCCESS(status);
1465 if (U_FAILURE(status)) {
73c04bcf
A
1466 break;
1467 }
46f4442e 1468 }
73c04bcf 1469
46f4442e
A
1470 delete [] testFile;
1471 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1472}
73c04bcf 1473
46f4442e
A
1474//--------------------------------------------------------------------------------------------
1475//
1476// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1477// test data files. Do only a simple, forward-only check -
1478// this test is mostly to check that ICU and the Unicode
1479// data agree with each other.
1480//
1481//--------------------------------------------------------------------------------------------
1482void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1483 const UnicodeString &testString, // Text data to be broken
1484 UVector32 *breakPositions, // Positions where breaks should be found.
1485 RuleBasedBreakIterator *bi) {
1486 int32_t pos; // Break Position in the test string
1487 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1488 int32_t expectedPos; // Expected break position (index into test string)
1489
1490 bi->setText(testString);
1491 pos = bi->first();
1492 pos = bi->next();
1493
1494 while (pos != BreakIterator::DONE) {
1495 if (expectedI >= breakPositions->size()) {
1496 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1497 testFileName, lineNumber, pos);
1498 break;
73c04bcf 1499 }
46f4442e
A
1500 expectedPos = breakPositions->elementAti(expectedI);
1501 if (pos < expectedPos) {
1502 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1503 testFileName, lineNumber, pos);
1504 break;
1505 }
1506 if (pos > expectedPos) {
1507 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1508 testFileName, lineNumber, expectedPos);
73c04bcf
A
1509 break;
1510 }
46f4442e
A
1511 pos = bi->next();
1512 expectedI++;
1513 }
73c04bcf 1514
46f4442e
A
1515 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1516 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1517 testFileName, lineNumber, breakPositions->elementAti(expectedI));
73c04bcf 1518 }
46f4442e 1519}
73c04bcf 1520
73c04bcf 1521
73c04bcf
A
1522
1523#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
1536class RBBIMonkeyKind {
1537public:
1538 // Return a UVector of UnicodeSets, representing the character classes used
1539 // for this type of iterator.
1540 virtual UVector *charClasses() = 0;
1541
1542 // Set the test text on which subsequent calls to next() will operate
1543 virtual void setText(const UnicodeString &s) = 0;
1544
1545 // Find the next break postion, starting from the prev break position, or from zero.
1546 // Return -1 after reaching end of string.
1547 virtual int32_t next(int32_t i) = 0;
1548
1549 virtual ~RBBIMonkeyKind();
1550 UErrorCode deferredStatus;
1551
1552
1553protected:
1554 RBBIMonkeyKind();
1555
1556private:
1557};
1558
1559RBBIMonkeyKind::RBBIMonkeyKind() {
1560 deferredStatus = U_ZERO_ERROR;
1561}
1562
1563RBBIMonkeyKind::~RBBIMonkeyKind() {
1564}
1565
1566
1567//----------------------------------------------------------------------------------------
1568//
1569// Random Numbers. Similar to standard lib rand() and srand()
1570// Not using library to
1571// 1. Get same results on all platforms.
1572// 2. Get access to current seed, to more easily reproduce failures.
1573//
1574//---------------------------------------------------------------------------------------
// Current state of the generator.  Kept at file scope so that a test can read
// or set it.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    // Linear congruential step; the arithmetic wraps modulo 2^32.
    m_seed = m_seed * 1103515245u + 12345u;
    // Bits 16..30 of the new state form the result.
    return (m_seed >> 16) & 0x7fffu;
}
1582
1583
1584//------------------------------------------------------------------------------------------
1585//
1586// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1587// of RBBIMonkeyKind.
1588//
1589//------------------------------------------------------------------------------------------
1590class RBBICharMonkey: public RBBIMonkeyKind {
1591public:
1592 RBBICharMonkey();
1593 virtual ~RBBICharMonkey();
1594 virtual UVector *charClasses();
1595 virtual void setText(const UnicodeString &s);
1596 virtual int32_t next(int32_t i);
1597private:
1598 UVector *fSets;
1599
1600 UnicodeSet *fCRLFSet;
1601 UnicodeSet *fControlSet;
1602 UnicodeSet *fExtendSet;
f3c0d7a5 1603 UnicodeSet *fZWJSet;
51004dcb 1604 UnicodeSet *fRegionalIndicatorSet;
46f4442e
A
1605 UnicodeSet *fPrependSet;
1606 UnicodeSet *fSpacingSet;
1607 UnicodeSet *fLSet;
1608 UnicodeSet *fVSet;
1609 UnicodeSet *fTSet;
1610 UnicodeSet *fLVSet;
1611 UnicodeSet *fLVTSet;
73c04bcf 1612 UnicodeSet *fHangulSet;
f3c0d7a5 1613 UnicodeSet *fExtendedPictSet;
f3c0d7a5 1614 UnicodeSet *fAnySet;
73c04bcf 1615
73c04bcf
A
1616 const UnicodeString *fText;
1617};
1618
1619
1620RBBICharMonkey::RBBICharMonkey() {
1621 UErrorCode status = U_ZERO_ERROR;
1622
1623 fText = NULL;
73c04bcf 1624
46f4442e 1625 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
f3c0d7a5
A
1626 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1627 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1628 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1629 fRegionalIndicatorSet =
1630 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
46f4442e
A
1631 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1632 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1633 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1634 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1635 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1636 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1637 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1638 fHangulSet = new UnicodeSet();
1639 fHangulSet->addAll(*fLSet);
1640 fHangulSet->addAll(*fVSet);
1641 fHangulSet->addAll(*fTSet);
1642 fHangulSet->addAll(*fLVSet);
1643 fHangulSet->addAll(*fLVTSet);
2ca993e8 1644
0f5d89e8 1645 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
f3c0d7a5 1646 fAnySet = new UnicodeSet(0, 0x10ffff);
2ca993e8 1647
f3c0d7a5 1648 fSets = new UVector(status);
73c04bcf
A
1649 fSets->addElement(fCRLFSet, status);
1650 fSets->addElement(fControlSet, status);
1651 fSets->addElement(fExtendSet, status);
51004dcb 1652 fSets->addElement(fRegionalIndicatorSet, status);
4388f060
A
1653 if (!fPrependSet->isEmpty()) {
1654 fSets->addElement(fPrependSet, status);
1655 }
46f4442e 1656 fSets->addElement(fSpacingSet, status);
73c04bcf
A
1657 fSets->addElement(fHangulSet, status);
1658 fSets->addElement(fAnySet, status);
2ca993e8 1659 fSets->addElement(fZWJSet, status);
f3c0d7a5 1660 fSets->addElement(fExtendedPictSet, status);
73c04bcf
A
1661 if (U_FAILURE(status)) {
1662 deferredStatus = status;
1663 }
1664}
1665
1666
1667void RBBICharMonkey::setText(const UnicodeString &s) {
1668 fText = &s;
73c04bcf
A
1669}
1670
1671
73c04bcf 1672
46f4442e
A
1673int32_t RBBICharMonkey::next(int32_t prevPos) {
1674 int p0, p1, p2, p3; // Indices of the significant code points around the
1675 // break position being tested. The candidate break
1676 // location is before p2.
1677
1678 int breakPos = -1;
1679
1680 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2ca993e8
A
1681 UChar32 cBase; // for (X Extend*) patterns, the X character.
1682
46f4442e
A
1683 if (U_FAILURE(deferredStatus)) {
1684 return -1;
73c04bcf 1685 }
46f4442e
A
1686
1687 // Previous break at end of string. return DONE.
1688 if (prevPos >= fText->length()) {
1689 return -1;
73c04bcf 1690 }
46f4442e
A
1691 p0 = p1 = p2 = p3 = prevPos;
1692 c3 = fText->char32At(prevPos);
2ca993e8 1693 c0 = c1 = c2 = cBase = 0;
57a6839d
A
1694 (void)p0; // suppress set but not used warning.
1695 (void)c0;
46f4442e
A
1696
1697 // Loop runs once per "significant" character position in the input text.
1698 for (;;) {
1699 // Move all of the positions forward in the input string.
1700 p0 = p1; c0 = c1;
1701 p1 = p2; c1 = c2;
1702 p2 = p3; c2 = c3;
1703
1704 // Advancd p3 by one codepoint
1705 p3 = fText->moveIndex32(p3, 1);
1706 c3 = fText->char32At(p3);
1707
1708 if (p1 == p2) {
1709 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1710 continue;
1711 }
1712 if (p2 == fText->length()) {
1713 // Reached end of string. Always a break position.
1714 break;
1715 }
1716
1717 // Rule GB3 CR x LF
1718 // No Extend or Format characters may appear between the CR and LF,
1719 // which requires the additional check for p2 immediately following p1.
1720 //
1721 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1722 continue;
1723 }
1724
1725 // Rule (GB4). ( Control | CR | LF ) <break>
1726 if (fControlSet->contains(c1) ||
1727 c1 == 0x0D ||
1728 c1 == 0x0A) {
1729 break;
1730 }
1731
1732 // Rule (GB5) <break> ( Control | CR | LF )
1733 //
1734 if (fControlSet->contains(c2) ||
1735 c2 == 0x0D ||
1736 c2 == 0x0A) {
1737 break;
1738 }
1739
1740
1741 // Rule (GB6) L x ( L | V | LV | LVT )
1742 if (fLSet->contains(c1) &&
1743 (fLSet->contains(c2) ||
1744 fVSet->contains(c2) ||
1745 fLVSet->contains(c2) ||
1746 fLVTSet->contains(c2))) {
1747 continue;
1748 }
1749
1750 // Rule (GB7) ( LV | V ) x ( V | T )
1751 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1752 (fVSet->contains(c2) || fTSet->contains(c2))) {
1753 continue;
1754 }
1755
1756 // Rule (GB8) ( LVT | T) x T
1757 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1758 fTSet->contains(c2)) {
1759 continue;
1760 }
1761
2ca993e8
A
1762 // Rule (GB9) x (Extend | ZWJ)
1763 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1764 if (!fExtendSet->contains(c1)) {
1765 cBase = c1;
1766 }
46f4442e
A
1767 continue;
1768 }
1769
1770 // Rule (GB9a) x SpacingMark
1771 if (fSpacingSet->contains(c2)) {
1772 continue;
1773 }
1774
1775 // Rule (GB9b) Prepend x
1776 if (fPrependSet->contains(c1)) {
1777 continue;
1778 }
1779
0f5d89e8
A
1780 // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1781 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
2ca993e8
A
1782 continue;
1783 }
1784
1785 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
1786 // Note: The first if condition is a little tricky. We only need to force
1787 // a break if there are three or more contiguous RIs. If there are
1788 // only two, a break following will occur via other rules, and will include
1789 // any trailing extend characters, which is needed behavior.
0f5d89e8 1790 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2ca993e8
A
1791 && fRegionalIndicatorSet->contains(c2)) {
1792 break;
1793 }
1794 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1795 continue;
1796 }
1797
1798 // Rule (GB999) Any <break> Any
46f4442e
A
1799 break;
1800 }
1801
1802 breakPos = p2;
1803 return breakPos;
73c04bcf
A
1804}
1805
1806
46f4442e 1807
73c04bcf
A
1808UVector *RBBICharMonkey::charClasses() {
1809 return fSets;
1810}
1811
1812
1813RBBICharMonkey::~RBBICharMonkey() {
1814 delete fSets;
1815 delete fCRLFSet;
1816 delete fControlSet;
1817 delete fExtendSet;
51004dcb 1818 delete fRegionalIndicatorSet;
46f4442e
A
1819 delete fPrependSet;
1820 delete fSpacingSet;
1821 delete fLSet;
1822 delete fVSet;
1823 delete fTSet;
1824 delete fLVSet;
1825 delete fLVTSet;
73c04bcf
A
1826 delete fHangulSet;
1827 delete fAnySet;
2ca993e8 1828 delete fZWJSet;
f3c0d7a5 1829 delete fExtendedPictSet;
73c04bcf
A
1830}
1831
1832//------------------------------------------------------------------------------------------
1833//
1834// class RBBIWordMonkey Word Break specific implementation
1835// of RBBIMonkeyKind.
1836//
1837//------------------------------------------------------------------------------------------
1838class RBBIWordMonkey: public RBBIMonkeyKind {
1839public:
1840 RBBIWordMonkey();
1841 virtual ~RBBIWordMonkey();
1842 virtual UVector *charClasses();
1843 virtual void setText(const UnicodeString &s);
1844 virtual int32_t next(int32_t i);
1845private:
1846 UVector *fSets;
1847
46f4442e
A
1848 UnicodeSet *fCRSet;
1849 UnicodeSet *fLFSet;
1850 UnicodeSet *fNewlineSet;
57a6839d 1851 UnicodeSet *fRegionalIndicatorSet;
73c04bcf 1852 UnicodeSet *fKatakanaSet;
57a6839d 1853 UnicodeSet *fHebrew_LetterSet;
73c04bcf 1854 UnicodeSet *fALetterSet;
57a6839d
A
1855 UnicodeSet *fSingle_QuoteSet;
1856 UnicodeSet *fDouble_QuoteSet;
46f4442e 1857 UnicodeSet *fMidNumLetSet;
73c04bcf
A
1858 UnicodeSet *fMidLetterSet;
1859 UnicodeSet *fMidNumSet;
1860 UnicodeSet *fNumericSet;
1861 UnicodeSet *fFormatSet;
1862 UnicodeSet *fOtherSet;
1863 UnicodeSet *fExtendSet;
1864 UnicodeSet *fExtendNumLetSet;
0f5d89e8 1865 UnicodeSet *fWSegSpaceSet;
f3c0d7a5 1866 UnicodeSet *fDictionarySet;
f3c0d7a5
A
1867 UnicodeSet *fZWJSet;
1868 UnicodeSet *fExtendedPictSet;
73c04bcf 1869
73c04bcf
A
1870 const UnicodeString *fText;
1871};
1872
1873
46f4442e 1874RBBIWordMonkey::RBBIWordMonkey()
73c04bcf
A
1875{
1876 UErrorCode status = U_ZERO_ERROR;
1877
73c04bcf
A
1878 fSets = new UVector(status);
1879
f3c0d7a5
A
1880 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1881 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1882 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1883 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1884 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1885 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1886 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1887 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1888 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1889 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1890 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
1891 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
3d1f044b 1892 fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
f3c0d7a5
A
1893 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1894 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1895 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
0f5d89e8 1896 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
f3c0d7a5 1897
f3c0d7a5 1898 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
0f5d89e8 1899 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
f3c0d7a5
A
1900
1901 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1902 fDictionarySet->addAll(*fKatakanaSet);
1903 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1904
1905 fALetterSet->removeAll(*fDictionarySet);
2ca993e8 1906
73c04bcf
A
1907 fOtherSet = new UnicodeSet();
1908 if(U_FAILURE(status)) {
f3c0d7a5
A
1909 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1910 deferredStatus = status;
1911 return;
73c04bcf
A
1912 }
1913
1914 fOtherSet->complement();
46f4442e
A
1915 fOtherSet->removeAll(*fCRSet);
1916 fOtherSet->removeAll(*fLFSet);
1917 fOtherSet->removeAll(*fNewlineSet);
73c04bcf 1918 fOtherSet->removeAll(*fKatakanaSet);
57a6839d 1919 fOtherSet->removeAll(*fHebrew_LetterSet);
73c04bcf 1920 fOtherSet->removeAll(*fALetterSet);
57a6839d
A
1921 fOtherSet->removeAll(*fSingle_QuoteSet);
1922 fOtherSet->removeAll(*fDouble_QuoteSet);
73c04bcf
A
1923 fOtherSet->removeAll(*fMidLetterSet);
1924 fOtherSet->removeAll(*fMidNumSet);
1925 fOtherSet->removeAll(*fNumericSet);
1926 fOtherSet->removeAll(*fExtendNumLetSet);
0f5d89e8 1927 fOtherSet->removeAll(*fWSegSpaceSet);
73c04bcf
A
1928 fOtherSet->removeAll(*fFormatSet);
1929 fOtherSet->removeAll(*fExtendSet);
51004dcb 1930 fOtherSet->removeAll(*fRegionalIndicatorSet);
f3c0d7a5
A
1931 fOtherSet->removeAll(*fZWJSet);
1932 fOtherSet->removeAll(*fExtendedPictSet);
f3c0d7a5 1933
46f4442e 1934 // Inhibit dictionary characters from being tested at all.
f3c0d7a5 1935 fOtherSet->removeAll(*fDictionarySet);
73c04bcf 1936
57a6839d
A
1937 fSets->addElement(fCRSet, status);
1938 fSets->addElement(fLFSet, status);
1939 fSets->addElement(fNewlineSet, status);
51004dcb 1940 fSets->addElement(fRegionalIndicatorSet, status);
57a6839d
A
1941 fSets->addElement(fHebrew_LetterSet, status);
1942 fSets->addElement(fALetterSet, status);
1943 fSets->addElement(fSingle_QuoteSet, status);
1944 fSets->addElement(fDouble_QuoteSet, status);
f3c0d7a5
A
1945 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
1946 // from the test data. They are all in the dictionary set,
1947 // which this (old, to be retired) monkey test cannot handle.
57a6839d
A
1948 fSets->addElement(fMidLetterSet, status);
1949 fSets->addElement(fMidNumLetSet, status);
1950 fSets->addElement(fMidNumSet, status);
1951 fSets->addElement(fNumericSet, status);
1952 fSets->addElement(fFormatSet, status);
1953 fSets->addElement(fExtendSet, status);
1954 fSets->addElement(fOtherSet, status);
1955 fSets->addElement(fExtendNumLetSet, status);
0f5d89e8 1956 fSets->addElement(fWSegSpaceSet, status);
73c04bcf 1957
f3c0d7a5
A
1958 fSets->addElement(fZWJSet, status);
1959 fSets->addElement(fExtendedPictSet, status);
2ca993e8 1960
73c04bcf
A
1961 if (U_FAILURE(status)) {
1962 deferredStatus = status;
1963 }
1964}
1965
1966void RBBIWordMonkey::setText(const UnicodeString &s) {
1967 fText = &s;
1968}
1969
1970
1971int32_t RBBIWordMonkey::next(int32_t prevPos) {
1972 int p0, p1, p2, p3; // Indices of the significant code points around the
1973 // break position being tested. The candidate break
1974 // location is before p2.
1975
1976 int breakPos = -1;
1977
1978 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2ca993e8 1979
46f4442e
A
1980 if (U_FAILURE(deferredStatus)) {
1981 return -1;
1982 }
73c04bcf
A
1983
1984 // Prev break at end of string. return DONE.
1985 if (prevPos >= fText->length()) {
1986 return -1;
1987 }
1988 p0 = p1 = p2 = p3 = prevPos;
1989 c3 = fText->char32At(prevPos);
1990 c0 = c1 = c2 = 0;
57a6839d 1991 (void)p0; // Suppress set but not used warning.
73c04bcf
A
1992
1993 // Loop runs once per "significant" character position in the input text.
1994 for (;;) {
1995 // Move all of the positions forward in the input string.
1996 p0 = p1; c0 = c1;
1997 p1 = p2; c1 = c2;
1998 p2 = p3; c2 = c3;
1999
2000 // Advancd p3 by X(Extend | Format)* Rule 4
46f4442e 2001 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
73c04bcf
A
2002 do {
2003 p3 = fText->moveIndex32(p3, 1);
2004 c3 = fText->char32At(p3);
46f4442e
A
2005 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2006 break;
2007 };
73c04bcf 2008 }
f3c0d7a5 2009 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
73c04bcf
A
2010
2011
2012 if (p1 == p2) {
2013 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2014 continue;
2015 }
2016 if (p2 == fText->length()) {
2017 // Reached end of string. Always a break position.
2018 break;
2019 }
46f4442e 2020
73c04bcf
A
2021 // Rule (3) CR x LF
2022 // No Extend or Format characters may appear between the CR and LF,
2023 // which requires the additional check for p2 immediately following p1.
2024 //
46f4442e 2025 if (c1==0x0D && c2==0x0A) {
73c04bcf
A
2026 continue;
2027 }
2ca993e8 2028
46f4442e
A
2029 // Rule (3a) Break before and after newlines (including CR and LF)
2030 //
2031 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2032 break;
2033 };
2034 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2035 break;
2036 };
73c04bcf 2037
0f5d89e8 2038 // Rule (3c) ZWJ x Extended_Pictographic
2ca993e8
A
2039 // Not ignoring extend chars, so peek into input text to
2040 // get the potential ZWJ, the character immediately preceding c2.
2041 // Sloppy UChar32 indexing: p2-1 may reference trail half
2042 // but char32At will get the full code point.
0f5d89e8
A
2043 if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
2044 continue;
2045 }
2046
2047 // Rule (3d) Keep horizontal whitespace together.
2048 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2ca993e8
A
2049 continue;
2050 }
2051
57a6839d
A
2052 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2053 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2054 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2055 continue;
2056 }
2057
57a6839d 2058 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
73c04bcf 2059 //
57a6839d
A
2060 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2061 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2062 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2063 continue;
2064 }
2065
2066 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2067 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2068 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2069 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2070 continue;
2071 }
2072
57a6839d
A
2073 // Rule (7a) Hebrew_Letter x Single_Quote
2074 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2075 continue;
2076 }
73c04bcf 2077
57a6839d
A
2078 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2079 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2080 continue;
2081 }
2082
2083 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2084 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
73c04bcf
A
2085 continue;
2086 }
2087
2088 // Rule (8) Numeric x Numeric
2089 if (fNumericSet->contains(c1) &&
2090 fNumericSet->contains(c2)) {
2091 continue;
2092 }
2093
57a6839d
A
2094 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2095 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
73c04bcf
A
2096 fNumericSet->contains(c2)) {
2097 continue;
2098 }
2099
57a6839d 2100 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
73c04bcf 2101 if (fNumericSet->contains(c1) &&
57a6839d 2102 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2103 continue;
2104 }
2105
57a6839d 2106 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
46f4442e 2107 if (fNumericSet->contains(c0) &&
57a6839d 2108 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
73c04bcf
A
2109 fNumericSet->contains(c2)) {
2110 continue;
2111 }
2112
57a6839d 2113 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
73c04bcf 2114 if (fNumericSet->contains(c1) &&
57a6839d 2115 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
73c04bcf
A
2116 fNumericSet->contains(c3)) {
2117 continue;
2118 }
2119
2120 // Rule (13) Katakana x Katakana
f3c0d7a5
A
2121 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2122 // all Katakana are handled by the dictionary breaker.
73c04bcf
A
2123 if (fKatakanaSet->contains(c1) &&
2124 fKatakanaSet->contains(c2)) {
2125 continue;
2126 }
2127
57a6839d
A
2128 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2129 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
73c04bcf
A
2130 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2131 fExtendNumLetSet->contains(c2)) {
2132 continue;
51004dcb 2133 }
73c04bcf 2134
57a6839d 2135 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
73c04bcf 2136 if (fExtendNumLetSet->contains(c1) &&
57a6839d
A
2137 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2138 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2139 continue;
51004dcb
A
2140 }
2141
f3c0d7a5 2142 // Rule 15 - 17 Group pairs of Regional Indicators.
2ca993e8
A
2143 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2144 break;
2145 }
51004dcb
A
2146 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2147 continue;
2148 }
73c04bcf 2149
f3c0d7a5 2150 // Rule 999. Break found here.
73c04bcf
A
2151 break;
2152 }
2153
2154 breakPos = p2;
2155 return breakPos;
2156}
2157
2158
2159UVector *RBBIWordMonkey::charClasses() {
2160 return fSets;
2161}
2162
2163
2164RBBIWordMonkey::~RBBIWordMonkey() {
2165 delete fSets;
46f4442e
A
2166 delete fCRSet;
2167 delete fLFSet;
2168 delete fNewlineSet;
73c04bcf 2169 delete fKatakanaSet;
57a6839d 2170 delete fHebrew_LetterSet;
73c04bcf 2171 delete fALetterSet;
57a6839d
A
2172 delete fSingle_QuoteSet;
2173 delete fDouble_QuoteSet;
46f4442e 2174 delete fMidNumLetSet;
73c04bcf
A
2175 delete fMidLetterSet;
2176 delete fMidNumSet;
2177 delete fNumericSet;
2178 delete fFormatSet;
2179 delete fExtendSet;
2180 delete fExtendNumLetSet;
0f5d89e8 2181 delete fWSegSpaceSet;
51004dcb 2182 delete fRegionalIndicatorSet;
f3c0d7a5 2183 delete fDictionarySet;
73c04bcf 2184 delete fOtherSet;
f3c0d7a5
A
2185 delete fZWJSet;
2186 delete fExtendedPictSet;
73c04bcf
A
2187}
2188
2189
2190
2191
2192//------------------------------------------------------------------------------------------
2193//
2194// class RBBISentMonkey Sentence Break specific implementation
2195// of RBBIMonkeyKind.
2196//
2197//------------------------------------------------------------------------------------------
2198class RBBISentMonkey: public RBBIMonkeyKind {
2199public:
2200 RBBISentMonkey();
2201 virtual ~RBBISentMonkey();
2202 virtual UVector *charClasses();
2203 virtual void setText(const UnicodeString &s);
2204 virtual int32_t next(int32_t i);
2205private:
2206 int moveBack(int posFrom);
2207 int moveForward(int posFrom);
2208 UChar32 cAt(int pos);
2209
2210 UVector *fSets;
2211
2212 UnicodeSet *fSepSet;
2213 UnicodeSet *fFormatSet;
2214 UnicodeSet *fSpSet;
2215 UnicodeSet *fLowerSet;
2216 UnicodeSet *fUpperSet;
2217 UnicodeSet *fOLetterSet;
2218 UnicodeSet *fNumericSet;
2219 UnicodeSet *fATermSet;
46f4442e 2220 UnicodeSet *fSContinueSet;
73c04bcf
A
2221 UnicodeSet *fSTermSet;
2222 UnicodeSet *fCloseSet;
2223 UnicodeSet *fOtherSet;
2224 UnicodeSet *fExtendSet;
2225
2226 const UnicodeString *fText;
2227
2228};
2229
2230RBBISentMonkey::RBBISentMonkey()
2231{
2232 UErrorCode status = U_ZERO_ERROR;
2233
2234 fSets = new UVector(status);
2235
46f4442e
A
2236 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2237 // set and made into character classes of their own. For the monkey impl,
2238 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2239 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2240 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2241 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2242 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2243 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2244 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2245 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2246 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2247 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2248 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2249 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2250 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
73c04bcf
A
2251 fOtherSet = new UnicodeSet();
2252
2253 if(U_FAILURE(status)) {
2254 deferredStatus = status;
2255 return;
2256 }
2257
2258 fOtherSet->complement();
2259 fOtherSet->removeAll(*fSepSet);
2260 fOtherSet->removeAll(*fFormatSet);
2261 fOtherSet->removeAll(*fSpSet);
2262 fOtherSet->removeAll(*fLowerSet);
2263 fOtherSet->removeAll(*fUpperSet);
2264 fOtherSet->removeAll(*fOLetterSet);
2265 fOtherSet->removeAll(*fNumericSet);
2266 fOtherSet->removeAll(*fATermSet);
46f4442e 2267 fOtherSet->removeAll(*fSContinueSet);
73c04bcf
A
2268 fOtherSet->removeAll(*fSTermSet);
2269 fOtherSet->removeAll(*fCloseSet);
2270 fOtherSet->removeAll(*fExtendSet);
2271
46f4442e
A
2272 fSets->addElement(fSepSet, status);
2273 fSets->addElement(fFormatSet, status);
2274 fSets->addElement(fSpSet, status);
2275 fSets->addElement(fLowerSet, status);
2276 fSets->addElement(fUpperSet, status);
2277 fSets->addElement(fOLetterSet, status);
2278 fSets->addElement(fNumericSet, status);
2279 fSets->addElement(fATermSet, status);
2280 fSets->addElement(fSContinueSet, status);
2281 fSets->addElement(fSTermSet, status);
2282 fSets->addElement(fCloseSet, status);
2283 fSets->addElement(fOtherSet, status);
2284 fSets->addElement(fExtendSet, status);
73c04bcf
A
2285
2286 if (U_FAILURE(status)) {
2287 deferredStatus = status;
2288 }
2289}
2290
2291
2292
2293void RBBISentMonkey::setText(const UnicodeString &s) {
2294 fText = &s;
2295}
2296
2297UVector *RBBISentMonkey::charClasses() {
2298 return fSets;
2299}
2300
2301
2302// moveBack() Find the "significant" code point preceding the index i.
2303// Skips over ($Extend | $Format)* .
46f4442e 2304//
73c04bcf
A
2305int RBBISentMonkey::moveBack(int i) {
2306 if (i <= 0) {
2307 return -1;
2308 }
2309 UChar32 c;
2310 int32_t j = i;
2311 do {
2312 j = fText->moveIndex32(j, -1);
2313 c = fText->char32At(j);
2314 }
2315 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2316 return j;
2317
2318 }
2319
2320
2321int RBBISentMonkey::moveForward(int i) {
2322 if (i>=fText->length()) {
2323 return fText->length();
2324 }
2325 UChar32 c;
2326 int32_t j = i;
2327 do {
2328 j = fText->moveIndex32(j, 1);
2329 c = cAt(j);
2330 }
2331 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2332 return j;
2333}
2334
2335UChar32 RBBISentMonkey::cAt(int pos) {
2336 if (pos<0 || pos>=fText->length()) {
2337 return -1;
2338 } else {
2339 return fText->char32At(pos);
2340 }
2341}
2342
2343int32_t RBBISentMonkey::next(int32_t prevPos) {
2344 int p0, p1, p2, p3; // Indices of the significant code points around the
2345 // break position being tested. The candidate break
2346 // location is before p2.
2347
2348 int breakPos = -1;
2349
2350 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2351 UChar32 c;
2352
46f4442e
A
2353 if (U_FAILURE(deferredStatus)) {
2354 return -1;
2355 }
2356
73c04bcf
A
2357 // Prev break at end of string. return DONE.
2358 if (prevPos >= fText->length()) {
2359 return -1;
2360 }
2361 p0 = p1 = p2 = p3 = prevPos;
2362 c3 = fText->char32At(prevPos);
2363 c0 = c1 = c2 = 0;
57a6839d 2364 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2365
2366 // Loop runs once per "significant" character position in the input text.
2367 for (;;) {
2368 // Move all of the positions forward in the input string.
2369 p0 = p1; c0 = c1;
2370 p1 = p2; c1 = c2;
2371 p2 = p3; c2 = c3;
46f4442e 2372
73c04bcf
A
2373 // Advancd p3 by X(Extend | Format)* Rule 4
2374 p3 = moveForward(p3);
2375 c3 = cAt(p3);
2376
2377 // Rule (3) CR x LF
2378 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2379 continue;
2380 }
46f4442e 2381
73c04bcf
A
2382 // Rule (4). Sep <break>
2383 if (fSepSet->contains(c1)) {
2384 p2 = p1+1; // Separators don't combine with Extend or Format.
2385 break;
2386 }
2387
2388 if (p2 >= fText->length()) {
2389 // Reached end of string. Always a break position.
2390 break;
2391 }
2392
2393 if (p2 == prevPos) {
2394 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2395 continue;
2396 }
46f4442e 2397
73c04bcf
A
2398 // Rule (6). ATerm x Numeric
2399 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2400 continue;
2401 }
2402
2ca993e8
A
2403 // Rule (7). (Upper | Lower) ATerm x Uppper
2404 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2405 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
73c04bcf
A
2406 continue;
2407 }
2408
2409 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2410 // Note: STerm | ATerm are added to the negated part of the expression by a
2411 // note to the Unicode 5.0 documents.
2412 int p8 = p1;
2413 while (fSpSet->contains(cAt(p8))) {
2414 p8 = moveBack(p8);
2415 }
2416 while (fCloseSet->contains(cAt(p8))) {
2417 p8 = moveBack(p8);
2418 }
2419 if (fATermSet->contains(cAt(p8))) {
2420 p8=p2;
2421 for (;;) {
2422 c = cAt(p8);
2423 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2424 fLowerSet->contains(c) || fSepSet->contains(c) ||
2425 fATermSet->contains(c) || fSTermSet->contains(c)) {
2426 break;
2427 }
2428 p8 = moveForward(p8);
2429 }
2430 if (fLowerSet->contains(cAt(p8))) {
2431 continue;
2432 }
2433 }
46f4442e
A
2434
2435 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2436 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
73c04bcf
A
2437 p8 = p1;
2438 while (fSpSet->contains(cAt(p8))) {
2439 p8 = moveBack(p8);
2440 }
2441 while (fCloseSet->contains(cAt(p8))) {
2442 p8 = moveBack(p8);
2443 }
2444 c = cAt(p8);
2445 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2446 continue;
2447 }
2448 }
2449
46f4442e 2450 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
73c04bcf
A
2451 int p9 = p1;
2452 while (fCloseSet->contains(cAt(p9))) {
2453 p9 = moveBack(p9);
2454 }
2455 c = cAt(p9);
2456 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2457 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2458 continue;
2459 }
2460 }
2461
46f4442e 2462 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
73c04bcf
A
2463 int p10 = p1;
2464 while (fSpSet->contains(cAt(p10))) {
2465 p10 = moveBack(p10);
2466 }
2467 while (fCloseSet->contains(cAt(p10))) {
2468 p10 = moveBack(p10);
2469 }
2470 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2471 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2472 continue;
2473 }
2474 }
2475
46f4442e 2476 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
73c04bcf 2477 int p11 = p1;
46f4442e
A
2478 if (fSepSet->contains(cAt(p11))) {
2479 p11 = moveBack(p11);
2480 }
73c04bcf
A
2481 while (fSpSet->contains(cAt(p11))) {
2482 p11 = moveBack(p11);
2483 }
2484 while (fCloseSet->contains(cAt(p11))) {
2485 p11 = moveBack(p11);
2486 }
2487 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2488 break;
2489 }
2490
2491 // Rule (12) Any x Any
2492 continue;
2493 }
2494 breakPos = p2;
2495 return breakPos;
2496}
2497
2498RBBISentMonkey::~RBBISentMonkey() {
2499 delete fSets;
2500 delete fSepSet;
2501 delete fFormatSet;
2502 delete fSpSet;
2503 delete fLowerSet;
2504 delete fUpperSet;
2505 delete fOLetterSet;
2506 delete fNumericSet;
2507 delete fATermSet;
46f4442e 2508 delete fSContinueSet;
73c04bcf
A
2509 delete fSTermSet;
2510 delete fCloseSet;
2511 delete fOtherSet;
2512 delete fExtendSet;
2513}
2514
2515
2516
2517//-------------------------------------------------------------------------------------------
2518//
2519// RBBILineMonkey
2520//
2521//-------------------------------------------------------------------------------------------
2522
2523class RBBILineMonkey: public RBBIMonkeyKind {
2524public:
2525 RBBILineMonkey();
2526 virtual ~RBBILineMonkey();
2527 virtual UVector *charClasses();
2528 virtual void setText(const UnicodeString &s);
2529 virtual int32_t next(int32_t i);
2530 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2531private:
2532 UVector *fSets;
2533
2534 UnicodeSet *fBK;
2535 UnicodeSet *fCR;
2536 UnicodeSet *fLF;
2537 UnicodeSet *fCM;
2538 UnicodeSet *fNL;
2539 UnicodeSet *fSG;
2540 UnicodeSet *fWJ;
2541 UnicodeSet *fZW;
2542 UnicodeSet *fGL;
2543 UnicodeSet *fCB;
2544 UnicodeSet *fSP;
2545 UnicodeSet *fB2;
2546 UnicodeSet *fBA;
2547 UnicodeSet *fBB;
3d1f044b 2548 UnicodeSet *fHH;
73c04bcf
A
2549 UnicodeSet *fHY;
2550 UnicodeSet *fH2;
2551 UnicodeSet *fH3;
2552 UnicodeSet *fCL;
729e4ab9 2553 UnicodeSet *fCP;
73c04bcf
A
2554 UnicodeSet *fEX;
2555 UnicodeSet *fIN;
2556 UnicodeSet *fJL;
2557 UnicodeSet *fJV;
2558 UnicodeSet *fJT;
2559 UnicodeSet *fNS;
2560 UnicodeSet *fOP;
2561 UnicodeSet *fQU;
2562 UnicodeSet *fIS;
2563 UnicodeSet *fNU;
2564 UnicodeSet *fPO;
2565 UnicodeSet *fPR;
2566 UnicodeSet *fSY;
2567 UnicodeSet *fAI;
2568 UnicodeSet *fAL;
4388f060
A
2569 UnicodeSet *fCJ;
2570 UnicodeSet *fHL;
73c04bcf 2571 UnicodeSet *fID;
51004dcb 2572 UnicodeSet *fRI;
73c04bcf 2573 UnicodeSet *fXX;
2ca993e8
A
2574 UnicodeSet *fEB;
2575 UnicodeSet *fEM;
3d1f044b 2576 UnicodeSet *fZWJ;
73c04bcf 2577
57a6839d 2578 BreakIterator *fCharBI;
73c04bcf 2579 const UnicodeString *fText;
73c04bcf 2580 RegexMatcher *fNumberMatcher;
73c04bcf
A
2581};
2582
2ca993e8
A
2583RBBILineMonkey::RBBILineMonkey() :
2584 RBBIMonkeyKind(),
2585 fSets(NULL),
2586
2587 fCharBI(NULL),
2588 fText(NULL),
2589 fNumberMatcher(NULL)
73c04bcf 2590
73c04bcf 2591{
2ca993e8
A
2592 if (U_FAILURE(deferredStatus)) {
2593 return;
2594 }
2595
73c04bcf
A
2596 UErrorCode status = U_ZERO_ERROR;
2597
2598 fSets = new UVector(status);
2599
46f4442e
A
2600 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2601 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2602 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2603 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2604 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2605 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2606 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2607 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2608 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2609 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2610 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2611 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2612 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3d1f044b 2613 fHH = new UnicodeSet();
46f4442e
A
2614 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2615 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2616 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2617 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
729e4ab9 2618 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
46f4442e
A
2619 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2620 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2621 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2622 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2623 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2624 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2625 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2626 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2627 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2628 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2629 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2630 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2631 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2632 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2633 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
4388f060
A
2634 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2635 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
46f4442e 2636 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
51004dcb 2637 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
46f4442e
A
2638 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2639 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
f3c0d7a5
A
2640 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2641 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3d1f044b 2642 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
73c04bcf
A
2643
2644 if (U_FAILURE(status)) {
2645 deferredStatus = status;
73c04bcf
A
2646 return;
2647 }
2648
2649 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2650 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
73c04bcf
A
2651 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2652
4388f060 2653 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3d1f044b
A
2654 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2655
2656 fHH->add(u'\u2010'); // Hyphen, '‐'
2ca993e8 2657
73c04bcf
A
2658 fSets->addElement(fBK, status);
2659 fSets->addElement(fCR, status);
2660 fSets->addElement(fLF, status);
2661 fSets->addElement(fCM, status);
2662 fSets->addElement(fNL, status);
2663 fSets->addElement(fWJ, status);
2664 fSets->addElement(fZW, status);
2665 fSets->addElement(fGL, status);
2666 fSets->addElement(fCB, status);
2667 fSets->addElement(fSP, status);
2668 fSets->addElement(fB2, status);
2669 fSets->addElement(fBA, status);
2670 fSets->addElement(fBB, status);
2671 fSets->addElement(fHY, status);
2672 fSets->addElement(fH2, status);
2673 fSets->addElement(fH3, status);
2674 fSets->addElement(fCL, status);
729e4ab9 2675 fSets->addElement(fCP, status);
73c04bcf
A
2676 fSets->addElement(fEX, status);
2677 fSets->addElement(fIN, status);
2678 fSets->addElement(fJL, status);
2679 fSets->addElement(fJT, status);
2680 fSets->addElement(fJV, status);
2681 fSets->addElement(fNS, status);
2682 fSets->addElement(fOP, status);
2683 fSets->addElement(fQU, status);
2684 fSets->addElement(fIS, status);
2685 fSets->addElement(fNU, status);
2686 fSets->addElement(fPO, status);
2687 fSets->addElement(fPR, status);
2688 fSets->addElement(fSY, status);
2689 fSets->addElement(fAI, status);
2690 fSets->addElement(fAL, status);
4388f060 2691 fSets->addElement(fHL, status);
73c04bcf
A
2692 fSets->addElement(fID, status);
2693 fSets->addElement(fWJ, status);
51004dcb 2694 fSets->addElement(fRI, status);
73c04bcf 2695 fSets->addElement(fSG, status);
2ca993e8
A
2696 fSets->addElement(fEB, status);
2697 fSets->addElement(fEM, status);
3d1f044b 2698 fSets->addElement(fZWJ, status);
f3c0d7a5 2699
73c04bcf 2700
2ca993e8 2701 const char *rules =
f3c0d7a5
A
2702 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2703 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3d1f044b 2704 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
f3c0d7a5
A
2705 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2706 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2707 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2708 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
46f4442e 2709
73c04bcf 2710 fNumberMatcher = new RegexMatcher(
46f4442e 2711 UnicodeString(rules, -1, US_INV), 0, status);
73c04bcf
A
2712
2713 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2714
2715 if (U_FAILURE(status)) {
2716 deferredStatus = status;
2717 }
2718}
2719
2720
2721void RBBILineMonkey::setText(const UnicodeString &s) {
2722 fText = &s;
2723 fCharBI->setText(s);
2724 fNumberMatcher->reset(s);
2725}
2726
2727//
2728// rule9Adjust
2729// Line Break TR rules 9 and 10 implementation.
2730// This deals with combining marks and other sequences that
2731// that must be treated as if they were something other than what they actually are.
2732//
2733// This is factored out into a separate function because it must be applied twice for
2734// each potential break, once to the chars before the position being checked, then
2735// again to the text following the possible break.
2736//
2737void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2738 if (pos == -1) {
2739 // Invalid initial position. Happens during the warmup iteration of the
2740 // main loop in next().
2741 return;
2742 }
2743
2744 int32_t nPos = *nextPos;
2745
2746 // LB 9 Keep combining sequences together.
2747 // advance over any CM class chars. Note that Line Break CM is different
2748 // from the normal Grapheme Extend property.
2749 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2750 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2751 for (;;) {
2752 *nextChar = fText->char32At(nPos);
2753 if (!fCM->contains(*nextChar)) {
2754 break;
2755 }
2756 nPos = fText->moveIndex32(nPos, 1);
2757 }
2758 }
2759
2760
2761 // LB 9 Treat X CM* as if it were x.
2762 // No explicit action required.
2763
2764 // LB 10 Treat any remaining combining mark as AL
2765 if (fCM->contains(*posChar)) {
f3c0d7a5 2766 *posChar = u'A';
73c04bcf
A
2767 }
2768
2769 // Push the updated nextPos and nextChar back to our caller.
2770 // This only makes a difference if posChar got bigger by consuming a
2771 // combining sequence.
2772 *nextPos = nPos;
2773 *nextChar = fText->char32At(nPos);
2774}
2775
2776
2777
2778int32_t RBBILineMonkey::next(int32_t startPos) {
2779 UErrorCode status = U_ZERO_ERROR;
2780 int32_t pos; // Index of the char following a potential break position
2781 UChar32 thisChar; // Character at above position "pos"
2782
2783 int32_t prevPos; // Index of the char preceding a potential break position
2784 UChar32 prevChar; // Character at above position. Note that prevChar
2785 // and thisChar may not be adjacent because combining
2786 // characters between them will be ignored.
2787
4388f060
A
2788 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2789 UChar32 prevCharX2;
2790
73c04bcf
A
2791 int32_t nextPos; // Index of the next character following pos.
2792 // Usually skips over combining marks.
2793 int32_t nextCPPos; // Index of the code point following "pos."
2794 // May point to a combining mark.
2795 int32_t tPos; // temp value.
2796 UChar32 c;
2797
46f4442e
A
2798 if (U_FAILURE(deferredStatus)) {
2799 return -1;
2800 }
2801
73c04bcf
A
2802 if (startPos >= fText->length()) {
2803 return -1;
2804 }
2805
2806
2807 // Initial values for loop. Loop will run the first time without finding breaks,
2808 // while the invalid values shift out and the "this" and
2809 // "prev" positions are filled in with good values.
4388f060
A
2810 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2811 thisChar = prevChar = prevCharX2 = 0;
73c04bcf
A
2812 nextPos = nextCPPos = startPos;
2813
2814
2815 // Loop runs once per position in the test text, until a break position
2816 // is found.
2817 for (;;) {
4388f060
A
2818 prevPosX2 = prevPos;
2819 prevCharX2 = prevChar;
2820
73c04bcf
A
2821 prevPos = pos;
2822 prevChar = thisChar;
2823
2824 pos = nextPos;
2825 thisChar = fText->char32At(pos);
2826
2827 nextCPPos = fText->moveIndex32(pos, 1);
2828 nextPos = nextCPPos;
2829
2830 // Rule LB2 - Break at end of text.
2831 if (pos >= fText->length()) {
2832 break;
2833 }
2834
2835 // Rule LB 9 - adjust for combining sequences.
2836 // We do this one out-of-order because the adjustment does not change anything
2837 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2838 // be applied.
2839 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2840 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2841 c = fText->char32At(nextPos);
2842 rule9Adjust(pos, &thisChar, &nextPos, &c);
2843
2844 // If the loop is still warming up - if we haven't shifted the initial
2845 // -1 positions out of prevPos yet - loop back to advance the
2846 // position in the input without any further looking for breaks.
2847 if (prevPos == -1) {
2848 continue;
2849 }
46f4442e 2850
73c04bcf
A
2851 // LB 4 Always break after hard line breaks,
2852 if (fBK->contains(prevChar)) {
2853 break;
2854 }
2855
2856 // LB 5 Break after CR, LF, NL, but not inside CR LF
2857 if (prevChar == 0x0d && thisChar == 0x0a) {
2858 continue;
2859 }
2860 if (prevChar == 0x0d ||
2861 prevChar == 0x0a ||
2862 prevChar == 0x85) {
2863 break;
2864 }
2865
2866 // LB 6 Don't break before hard line breaks
2867 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2868 fBK->contains(thisChar)) {
2869 continue;
2870 }
2871
2872
2873 // LB 7 Don't break before spaces or zero-width space.
2874 if (fSP->contains(thisChar)) {
2875 continue;
2876 }
2877
2878 if (fZW->contains(thisChar)) {
2879 continue;
2880 }
2881
2882 // LB 8 Break after zero width space
3d1f044b
A
2883 // ZW SP* ÷
2884 // Scan backwards from prevChar for SP* ZW
2885 tPos = prevPos;
2886 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2887 tPos = fText->moveIndex32(tPos, -1);
2888 }
2889 if (fZW->contains(fText->char32At(tPos))) {
73c04bcf
A
2890 break;
2891 }
2892
0f5d89e8
A
2893 // LB 25 Numbers
2894 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2895 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2896 if (fNumberMatcher->lookingAt(prevPos, status)) {
2897 if (U_FAILURE(status)) {
2898 break;
2899 }
2900 // Matched a number. But could have been just a single digit, which would
2901 // not represent a "no break here" between prevChar and thisChar
2902 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
2903 if (numEndIdx > pos) {
2904 // Number match includes at least our two chars being checked
2905 if (numEndIdx > nextPos) {
2906 // Number match includes additional chars. Update pos and nextPos
2907 // so that next loop iteration will continue at the end of the number,
2908 // checking for breaks between last char in number & whatever follows.
2909 pos = nextPos = numEndIdx;
2910 do {
2911 pos = fText->moveIndex32(pos, -1);
2912 thisChar = fText->char32At(pos);
2913 } while (fCM->contains(thisChar));
2914 }
2915 continue;
2916 }
2917 }
2918
2919 // LB 8a ZWJ x
2ca993e8
A
2920 // The monkey test's way of ignoring combining characters doesn't work
2921 // for this rule. ZJ is also a CM. Need to get the actual character
2922 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2923 {
2924 int32_t prevIdx = fText->moveIndex32(pos, -1);
2925 UChar32 prevC = fText->char32At(prevIdx);
3d1f044b 2926 if (fZWJ->contains(prevC)) {
2ca993e8
A
2927 continue;
2928 }
2929 }
2930
73c04bcf
A
2931 // LB 9, 10 Already done, at top of loop.
2932 //
2933
2934
2935 // LB 11 Do not break before or after WORD JOINER and related characters.
2936 // x WJ
2937 // WJ x
2938 //
2939 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2940 continue;
2941 }
2942
2943 // LB 12
73c04bcf 2944 // GL x
46f4442e 2945 if (fGL->contains(prevChar)) {
73c04bcf
A
2946 continue;
2947 }
2ca993e8 2948
46f4442e
A
2949 // LB 12a
2950 // [^SP BA HY] x GL
2951 if (!(fSP->contains(prevChar) ||
2952 fBA->contains(prevChar) ||
2953 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
2954 continue;
2955 }
2956
73c04bcf 2957 // LB 13 Don't break before closings.
73c04bcf 2958 //
3d1f044b
A
2959 if (fCL->contains(thisChar) ||
2960 fCP->contains(thisChar) ||
2961 fEX->contains(thisChar) ||
2962 fSY->contains(thisChar)) {
73c04bcf
A
2963 continue;
2964 }
2965
2966 // LB 14 Don't break after OP SP*
2967 // Scan backwards, checking for this sequence.
2968 // The OP char could include combining marks, so we actually check for
2969 // OP CM* SP*
3d1f044b 2970 // Another Twist: The Rule 9 fixes may have changed a SP CM
73c04bcf
A
2971 // sequence into a ID char, so before scanning back through spaces,
2972 // verify that prevChar is indeed a space. The prevChar variable
2973 // may differ from fText[prevPos]
2974 tPos = prevPos;
2975 if (fSP->contains(prevChar)) {
2976 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2977 tPos=fText->moveIndex32(tPos, -1);
2978 }
2979 }
2980 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2981 tPos=fText->moveIndex32(tPos, -1);
2982 }
2983 if (fOP->contains(fText->char32At(tPos))) {
2984 continue;
2985 }
2986
2987
3d1f044b
A
2988 // LB 14a Break before an IS that begins a number and follows a space
2989 if (nextPos < fText->length()) {
2990 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
2991 // from a legit ffff character. So test length separately.
2992 UChar32 nextChar = fText->char32At(nextPos);
2993 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
2994 break;
2995 }
2996 }
2997
2998 // LB14b Do not break before numeric separators, even after spaces.
2999 if (fIS->contains(thisChar)) {
3000 continue;
3001 }
3002
73c04bcf
A
3003 // LB 15 QU SP* x OP
3004 if (fOP->contains(thisChar)) {
3005 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3006 int tPos = prevPos;
3007 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3008 tPos = fText->moveIndex32(tPos, -1);
3009 }
3010 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3011 tPos = fText->moveIndex32(tPos, -1);
3012 }
3013 if (fQU->contains(fText->char32At(tPos))) {
3014 continue;
3015 }
3016 }
3017
3018
3019
729e4ab9
A
3020 // LB 16 (CL | CP) SP* x NS
3021 // Scan backwards for SP* CM* (CL | CP)
73c04bcf
A
3022 if (fNS->contains(thisChar)) {
3023 int tPos = prevPos;
3024 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3025 tPos = fText->moveIndex32(tPos, -1);
3026 }
3027 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3028 tPos = fText->moveIndex32(tPos, -1);
3029 }
729e4ab9 3030 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
73c04bcf
A
3031 continue;
3032 }
3033 }
3034
3035
3036 // LB 17 B2 SP* x B2
3037 if (fB2->contains(thisChar)) {
3038 // Scan backwards, checking for the B2 CM* SP* sequence.
3039 tPos = prevPos;
3040 if (fSP->contains(prevChar)) {
3041 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3042 tPos=fText->moveIndex32(tPos, -1);
3043 }
3044 }
3045 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3046 tPos=fText->moveIndex32(tPos, -1);
3047 }
3048 if (fB2->contains(fText->char32At(tPos))) {
3049 continue;
3050 }
3051 }
3052
46f4442e 3053
73c04bcf
A
3054 // LB 18 break after space
3055 if (fSP->contains(prevChar)) {
3056 break;
3057 }
3058
3059 // LB 19
3060 // x QU
3061 // QU x
3062 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3063 continue;
3064 }
3065
3066 // LB 20 Break around a CB
3067 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3068 break;
3069 }
3070
3d1f044b
A
3071 // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
3072 // Formerly this was a Finnish tailoring.
3073 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3074 // ^($HY | $HH) $AL;
3075 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3076 prevPosX2 == -1) {
3077 continue;
3078 }
3079
73c04bcf
A
3080 // LB 21
3081 if (fBA->contains(thisChar) ||
3082 fHY->contains(thisChar) ||
3083 fNS->contains(thisChar) ||
3084 fBB->contains(prevChar) ) {
3085 continue;
3086 }
3087
4388f060
A
3088 // LB 21a
3089 // HL (HY | BA) x
2ca993e8 3090 if (fHL->contains(prevCharX2) &&
4388f060
A
3091 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3092 continue;
3093 }
3094
57a6839d
A
3095 // LB 21b
3096 // SY x HL
51004dcb
A
3097 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3098 continue;
3099 }
3100
73c04bcf 3101 // LB 22
729e4ab9 3102 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
2ca993e8 3103 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
4388f060 3104 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
f3c0d7a5 3105 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
729e4ab9
A
3106 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3107 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
73c04bcf
A
3108 continue;
3109 }
3110
3111
f3c0d7a5
A
3112 // LB 23 (AL | HL) x NU
3113 // NU x (AL | HL)
3114 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3115 continue;
3116 }
3117 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3118 continue;
3119 }
3120
3121 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3122 // PR x (ID | EB | EM)
3123 // (ID | EB | EM) x PO
0f5d89e8 3124 if (fPR->contains(prevChar) &&
f3c0d7a5
A
3125 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3126 continue;
3127 }
0f5d89e8 3128 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
f3c0d7a5 3129 fPO->contains(thisChar)) {
73c04bcf
A
3130 continue;
3131 }
3132
3133 // LB 24 Do not break between prefix and letters or ideographs.
f3c0d7a5
A
3134 // (PR | PO) x (AL | HL)
3135 // (AL | HL) x (PR | PO)
3136 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3137 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3138 continue;
3139 }
3140 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3141 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
73c04bcf
A
3142 continue;
3143 }
46f4442e 3144
0f5d89e8 3145 // LB 25 numbers match, moved up, before LB 8a,
73c04bcf
A
3146
3147 // LB 26 Do not break a Korean syllable.
3148 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3149 fJV->contains(thisChar) ||
3150 fH2->contains(thisChar) ||
3151 fH3->contains(thisChar))) {
3152 continue;
3153 }
3154
3155 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3156 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3157 continue;
3158 }
3159
3160 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3161 fJT->contains(thisChar)) {
3162 continue;
3163 }
3164
3165 // LB 27 Treat a Korean Syllable Block the same as ID.
3166 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3167 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3168 fIN->contains(thisChar)) {
3169 continue;
3170 }
3171 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3172 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3173 fPO->contains(thisChar)) {
3174 continue;
3175 }
3176 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3177 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3178 continue;
3179 }
3180
3181
3182
46f4442e 3183 // LB 28 Do not break between alphabetics ("at").
4388f060 3184 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3185 continue;
3186 }
3187
3188 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
4388f060 3189 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3190 continue;
3191 }
3192
729e4ab9
A
3193 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3194 // (AL | NU) x OP
3195 // CP x (AL | NU)
4388f060 3196 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
729e4ab9
A
3197 continue;
3198 }
4388f060 3199 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
729e4ab9
A
3200 continue;
3201 }
3202
3d1f044b
A
3203 // LB30a RI RI ÷ RI
3204 // RI x RI
2ca993e8
A
3205 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3206 break;
3207 }
51004dcb 3208 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3d1f044b
A
3209 // Two Regional Indicators have been paired.
3210 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3211 // following RI. This is a hack.
3212 thisChar = -1;
51004dcb
A
3213 continue;
3214 }
3215
2ca993e8
A
3216 // LB30b Emoji Base x Emoji Modifier
3217 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3218 continue;
3219 }
3220
73c04bcf
A
3221 // LB 31 Break everywhere else
3222 break;
3223
3224 }
3225
3226 return pos;
3227}
3228
3229
3230UVector *RBBILineMonkey::charClasses() {
3231 return fSets;
3232}
3233
3234
3235RBBILineMonkey::~RBBILineMonkey() {
3236 delete fSets;
3237
3238 delete fBK;
3239 delete fCR;
3240 delete fLF;
3241 delete fCM;
3242 delete fNL;
3243 delete fWJ;
3244 delete fZW;
3245 delete fGL;
3246 delete fCB;
3247 delete fSP;
3248 delete fB2;
3249 delete fBA;
3250 delete fBB;
3d1f044b 3251 delete fHH;
73c04bcf
A
3252 delete fHY;
3253 delete fH2;
3254 delete fH3;
3255 delete fCL;
729e4ab9 3256 delete fCP;
73c04bcf
A
3257 delete fEX;
3258 delete fIN;
3259 delete fJL;
3260 delete fJV;
3261 delete fJT;
3262 delete fNS;
3263 delete fOP;
3264 delete fQU;
3265 delete fIS;
3266 delete fNU;
3267 delete fPO;
3268 delete fPR;
3269 delete fSY;
3270 delete fAI;
3271 delete fAL;
4388f060
A
3272 delete fCJ;
3273 delete fHL;
73c04bcf 3274 delete fID;
51004dcb 3275 delete fRI;
73c04bcf
A
3276 delete fSG;
3277 delete fXX;
2ca993e8
A
3278 delete fEB;
3279 delete fEM;
3d1f044b 3280 delete fZWJ;
73c04bcf
A
3281
3282 delete fCharBI;
3283 delete fNumberMatcher;
3284}
3285
3286
3287//-------------------------------------------------------------------------------------------
3288//
3289// TestMonkey
3290//
3291// params
3292// seed=nnnnn Random number starting seed.
3293// Setting the seed allows errors to be reproduced.
3294// loop=nnn Looping count. Controls running time.
3295// -1: run forever.
3296// 0 or greater: run length.
3297//
3298// type = char | word | line | sent | title
3299//
2ca993e8
A
3300// Example:
3301// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3302//
73c04bcf
A
3303//-------------------------------------------------------------------------------------------
3304
3305static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3306 int32_t val = defaultVal;
3307 name.append(" *= *(-?\\d+)");
3308 UErrorCode status = U_ZERO_ERROR;
3309 RegexMatcher m(name, params, 0, status);
3310 if (m.find()) {
3311 // The param exists. Convert the string to an int.
3312 char valString[100];
3313 int32_t paramLength = m.end(1, status) - m.start(1, status);
3314 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3315 paramLength = (int32_t)(sizeof(valString)-2);
3316 }
3317 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3318 val = strtol(valString, NULL, 10);
3319
3320 // Delete this parameter from the params string.
3321 m.reset();
3322 params = m.replaceFirst("", status);
3323 }
3324 U_ASSERT(U_SUCCESS(status));
3325 return val;
3326}
3327#endif
3328
51004dcb 3329#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
3330static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3331 BreakIterator *bi,
3332 int expected[],
3333 int expectedcount)
3334{
3335 int count = 0;
3336 int i = 0;
3337 int forward[50];
3338 bi->setText(ustr);
3339 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3340 forward[count] = i;
3341 if (count < expectedcount && expected[count] != i) {
0f5d89e8
A
3342 test->errln("%s:%d break forward test failed: expected %d but got %d",
3343 __FILE__, __LINE__, expected[count], i);
73c04bcf
A
3344 break;
3345 }
3346 count ++;
3347 }
3348 if (count != expectedcount) {
3349 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3350 test->errln("%s:%d break forward test failed: missed %d match",
3351 __FILE__, __LINE__, expectedcount - count);
73c04bcf
A
3352 return;
3353 }
3354 // testing boundaries
3355 for (i = 1; i < expectedcount; i ++) {
3356 int j = expected[i - 1];
3357 if (!bi->isBoundary(j)) {
3358 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3359 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3360 __FILE__, __LINE__, j);
73c04bcf
A
3361 return;
3362 }
3363 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3364 if (bi->isBoundary(j)) {
3365 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3366 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3367 __FILE__, __LINE__, j);
73c04bcf
A
3368 return;
3369 }
3370 }
3371 }
3372
3373 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3374 count --;
3375 if (forward[count] != i) {
51004dcb 3376 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3377 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3378 __FILE__, __LINE__, forward[count], i);
73c04bcf
A
3379 break;
3380 }
3381 }
3382 if (count != 0) {
3383 printStringBreaks(ustr, expected, expectedcount);
3384 test->errln("break test previous() failed: missed a match");
3385 return;
3386 }
3387
3388 // testing preceding
3389 for (i = 0; i < expectedcount - 1; i ++) {
3390 // int j = expected[i] + 1;
3391 int j = ustr.moveIndex32(expected[i], 1);
3392 for (; j <= expected[i + 1]; j ++) {
0f5d89e8
A
3393 int32_t expectedPreceding = expected[i];
3394 int32_t actualPreceding = bi->preceding(j);
3395 if (actualPreceding != expectedPreceding) {
73c04bcf 3396 printStringBreaks(ustr, expected, expectedcount);
0f5d89e8
A
3397 test->errln("%s:%d preceding(%d): expected %d, got %d",
3398 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
73c04bcf
A
3399 return;
3400 }
3401 }
3402 }
3403}
51004dcb 3404#endif
73c04bcf
A
3405
3406void RBBITest::TestWordBreaks(void)
3407{
3408#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3409
73c04bcf
A
3410 Locale locale("en");
3411 UErrorCode status = U_ZERO_ERROR;
3412 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3413 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
51004dcb
A
3414 // Replaced any C+J characters in a row with a random sequence of characters
3415 // of the same length to make our C+J segmentation not get in the way.
73c04bcf
A
3416 static const char *strlist[] =
3417 {
3418 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
51004dcb 3419 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
46f4442e 3420 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
73c04bcf 3421 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
51004dcb 3422 "\\uac00\\u3588\\u009c\\u0953\\u194b",
73c04bcf
A
3423 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3424 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
51004dcb 3425 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
73c04bcf
A
3426 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3427 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3428 "\\u2027\\U000e0067\\u0a47\\u00b7",
3429 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3430 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3431 "\\u0589\\U000e006e\\u0a42\\U000104a5",
51004dcb 3432 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
73c04bcf
A
3433 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3434 "\\u0027\\u11af\\U000e0057\\u0602",
3435 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3436 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3437 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3438 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
46f4442e 3439 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
73c04bcf
A
3440 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3441 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3442 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3443 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
51004dcb 3444 "\\u18f4\\U000e0049\\u20e7\\u2027",
73c04bcf
A
3445 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3446 "\\ua183\\u102d\\u0bec\\u003a",
3447 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3448 "\\u003a\\u0e57\\u0fad\\u002e",
3449 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3450 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3451 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3452 "\\u003a\\u0664\\u00b7\\u1fba",
3453 "\\u003b\\u0027\\u00b7\\u47a3",
51004dcb 3454 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
73c04bcf
A
3455 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3456 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3457 };
3458 int loop;
3459 if (U_FAILURE(status)) {
729e4ab9 3460 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3461 return;
3462 }
2ca993e8 3463 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf 3464 // printf("looping %d\n", loop);
46f4442e 3465 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
73c04bcf
A
3466 // RBBICharMonkey monkey;
3467 RBBIWordMonkey monkey;
3468
3469 int expected[50];
3470 int expectedcount = 0;
3471
3472 monkey.setText(ustr);
3473 int i;
3474 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3475 expected[expectedcount ++] = i;
3476 }
3477
3478 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3479 }
3480 delete bi;
3481#endif
3482}
3483
3484void RBBITest::TestWordBoundary(void)
3485{
3486 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3487 Locale locale("en");
3488 UErrorCode status = U_ZERO_ERROR;
3489 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
0f5d89e8
A
3490 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3491 if (U_FAILURE(status)) {
3492 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3493 __FILE__, __LINE__, u_errorName(status));
3494 return;
3495 }
73c04bcf
A
3496 UChar str[50];
3497 static const char *strlist[] =
3498 {
3499 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3500 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3501 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3502 "\\u2027\\U000e0067\\u0a47\\u00b7",
3503 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3504 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3505 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3506 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3507 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3508 "\\u0027\\u11af\\U000e0057\\u0602",
3509 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3510 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3511 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3512 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3513 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
51004dcb 3514 "\\U000e0065\\u302c\\u09ee\\U000e0068",
73c04bcf
A
3515 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3516 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3517 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3518 "\\u58f4\\U000e0049\\u20e7\\u2027",
51004dcb 3519 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
73c04bcf
A
3520 "\\ua183\\u102d\\u0bec\\u003a",
3521 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3522 "\\u003a\\u0e57\\u0fad\\u002e",
3523 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3524 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3525 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3526 "\\u003a\\u0664\\u00b7\\u1fba",
3527 "\\u003b\\u0027\\u00b7\\u47a3",
3528 };
3529 int loop;
2ca993e8 3530 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
0f5d89e8 3531 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
73c04bcf
A
3532 UnicodeString ustr(str);
3533 int forward[50];
3534 int count = 0;
3535
3536 bi->setText(ustr);
0f5d89e8
A
3537 int prev = -1;
3538 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3539 ++count;
3540 if (count >= UPRV_LENGTHOF(forward)) {
3541 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3542 __FILE__, __LINE__, loop, count, boundary);
3543 return;
3544 }
3545 forward[count] = boundary;
3546 if (boundary <= prev) {
3547 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3548 __FILE__, __LINE__, loop, prev, boundary);
3549 break;
3550 }
3551 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3552 if (bi->isBoundary(nonBoundary)) {
3553 printStringBreaks(ustr, forward, count);
3554 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3555 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3556 return;
73c04bcf
A
3557 }
3558 }
0f5d89e8 3559 if (!bi->isBoundary(boundary)) {
73c04bcf 3560 printStringBreaks(ustr, forward, count);
0f5d89e8
A
3561 errln("%s:%d happy boundary test failed: expected %d a boundary",
3562 __FILE__, __LINE__, boundary);
73c04bcf
A
3563 return;
3564 }
0f5d89e8 3565 prev = boundary;
73c04bcf
A
3566 }
3567 }
73c04bcf
A
3568}
3569
3570void RBBITest::TestLineBreaks(void)
3571{
3572#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3573 Locale locale("en");
3574 UErrorCode status = U_ZERO_ERROR;
3575 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3576 const int32_t STRSIZE = 50;
3577 UChar str[STRSIZE];
3578 static const char *strlist[] =
3579 {
3580 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3581 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3582 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3583 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3584 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3585 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3586 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3587 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3588 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3589 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
73c04bcf
A
3590 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3591 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3592 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3593 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3594 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3595 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3596 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3597 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3598 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3599 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3600 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3601 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3602 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3603 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3604 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
73c04bcf
A
3605 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3606 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3607 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3608 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3609 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
73c04bcf
A
3610 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3611 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
73c04bcf
A
3612 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3613 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3614 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3615 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3616 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3617 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
73c04bcf
A
3618 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3619 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3620 };
3621 int loop;
3622 TEST_ASSERT_SUCCESS(status);
3623 if (U_FAILURE(status)) {
3624 return;
3625 }
2ca993e8 3626 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf
A
3627 // printf("looping %d\n", loop);
3628 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3629 if (t >= STRSIZE) {
3630 TEST_ASSERT(FALSE);
3631 continue;
3632 }
3633
46f4442e 3634
73c04bcf
A
3635 UnicodeString ustr(str);
3636 RBBILineMonkey monkey;
3637 if (U_FAILURE(monkey.deferredStatus)) {
3638 continue;
3639 }
3640
3641 const int EXPECTEDSIZE = 50;
3642 int expected[EXPECTEDSIZE];
3643 int expectedcount = 0;
3644
3645 monkey.setText(ustr);
3646 int i;
3647 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3648 if (expectedcount >= EXPECTEDSIZE) {
3649 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3650 return;
3651 }
3652 expected[expectedcount ++] = i;
3653 }
3654
3655 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3656 }
3657 delete bi;
3658#endif
3659}
3660
3661void RBBITest::TestSentBreaks(void)
3662{
3663#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3664 Locale locale("en");
3665 UErrorCode status = U_ZERO_ERROR;
3666 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3667 UChar str[200];
3668 static const char *strlist[] =
3669 {
3670 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3671 "This\n",
3672 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3673 "\"Sentence ending with a quote.\" Bye.",
3674 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3675 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3676 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3677 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3678 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3679 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3680 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3681 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3682 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3683 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3684 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3685 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3686 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3687 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3688 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3689 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3690 };
3691 int loop;
3692 if (U_FAILURE(status)) {
729e4ab9 3693 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3694 return;
3695 }
2ca993e8
A
3696 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3697 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
73c04bcf
A
3698 UnicodeString ustr(str);
3699
3700 RBBISentMonkey monkey;
3701 if (U_FAILURE(monkey.deferredStatus)) {
3702 continue;
3703 }
3704
3705 const int EXPECTEDSIZE = 50;
3706 int expected[EXPECTEDSIZE];
3707 int expectedcount = 0;
3708
3709 monkey.setText(ustr);
3710 int i;
3711 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3712 if (expectedcount >= EXPECTEDSIZE) {
3713 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3714 return;
3715 }
3716 expected[expectedcount ++] = i;
3717 }
3718
3719 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3720 }
3721 delete bi;
3722#endif
3723}
3724
f3c0d7a5 3725void RBBITest::TestMonkey() {
73c04bcf
A
3726#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3727
3728 UErrorCode status = U_ZERO_ERROR;
3729 int32_t loopCount = 500;
3730 int32_t seed = 1;
3731 UnicodeString breakType = "all";
3732 Locale locale("en");
3733 UBool useUText = FALSE;
3734
3735 if (quick == FALSE) {
3736 loopCount = 10000;
3737 }
3738
f3c0d7a5
A
3739 if (fTestParams) {
3740 UnicodeString p(fTestParams);
73c04bcf
A
3741 loopCount = getIntParam("loop", p, loopCount);
3742 seed = getIntParam("seed", p, seed);
3743
3744 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3745 if (m.find()) {
3746 breakType = m.group(1, status);
3747 m.reset();
3748 p = m.replaceFirst("", status);
3749 }
3750
3751 RegexMatcher u(" *utext", p, 0, status);
3752 if (u.find()) {
3753 useUText = TRUE;
3754 u.reset();
3755 p = u.replaceFirst("", status);
3756 }
3757
3758
3759 // m.reset(p);
46f4442e 3760 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
73c04bcf
A
3761 // Each option is stripped out of the option string as it is processed.
3762 // All options have been checked. The option string should have been completely emptied..
3763 char buf[100];
3764 p.extract(buf, sizeof(buf), NULL, status);
3765 buf[sizeof(buf)-1] = 0;
3766 errln("Unrecognized or extra parameter: %s\n", buf);
3767 return;
3768 }
3769
3770 }
3771
3772 if (breakType == "char" || breakType == "all") {
3773 RBBICharMonkey m;
3774 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3775 if (U_SUCCESS(status)) {
3776 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3777 if (breakType == "all" && useUText==FALSE) {
3778 // Also run a quick test with UText when "all" is specified
3779 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3780 }
3781 }
3782 else {
729e4ab9 3783 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
73c04bcf
A
3784 }
3785 delete bi;
3786 }
3787
3788 if (breakType == "word" || breakType == "all") {
3789 logln("Word Break Monkey Test");
3790 RBBIWordMonkey m;
3791 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3792 if (U_SUCCESS(status)) {
3793 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3794 }
3795 else {
729e4ab9 3796 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
73c04bcf
A
3797 }
3798 delete bi;
3799 }
3800
3801 if (breakType == "line" || breakType == "all") {
3802 logln("Line Break Monkey Test");
3803 RBBILineMonkey m;
3804 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3805 if (loopCount >= 10) {
3806 loopCount = loopCount / 5; // Line break runs slower than the others.
3807 }
3808 if (U_SUCCESS(status)) {
3809 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3810 }
3811 else {
729e4ab9 3812 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3813 }
3814 delete bi;
3815 }
3816
46f4442e 3817 if (breakType == "sent" || breakType == "all" ) {
73c04bcf
A
3818 logln("Sentence Break Monkey Test");
3819 RBBISentMonkey m;
3820 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3821 if (loopCount >= 10) {
3822 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3823 }
3824 if (U_SUCCESS(status)) {
3825 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3826 }
3827 else {
729e4ab9 3828 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3829 }
3830 delete bi;
3831 }
3832
3833#endif
3834}
3835
3836//
3837// Run a RBBI monkey test. Common routine, for all break iterator types.
3838// Parameters:
3839// bi - the break iterator to use
3840// mk - MonkeyKind, abstraction for obtaining expected results
3841// name - Name of test (char, word, etc.) for use in error messages
3842// seed - Seed for starting random number generator (parameter from user)
3843// numIterations
3844//
46f4442e 3845void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
73c04bcf
A
3846 int32_t numIterations, UBool useUText) {
3847
3848#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3849
3850 const int32_t TESTSTRINGLEN = 500;
3851 UnicodeString testText;
3852 int32_t numCharClasses;
3853 UVector *chClasses;
3854 int expected[TESTSTRINGLEN*2 + 1];
3855 int expectedCount = 0;
3856 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3857 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3858 char reverseBreaks[TESTSTRINGLEN*2+1];
3859 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3860 char followingBreaks[TESTSTRINGLEN*2+1];
3861 char precedingBreaks[TESTSTRINGLEN*2+1];
3862 int i;
3863 int loopCount = 0;
3864
3865 m_seed = seed;
3866
3867 numCharClasses = mk.charClasses()->size();
3868 chClasses = mk.charClasses();
3869
3870 // Check for errors that occured during the construction of the MonkeyKind object.
3871 // Can't report them where they occured because errln() is a method coming from intlTest,
3872 // and is not visible outside of RBBITest :-(
3873 if (U_FAILURE(mk.deferredStatus)) {
3874 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3875 return;
3876 }
3877
3878 // Verify that the character classes all have at least one member.
3879 for (i=0; i<numCharClasses; i++) {
3880 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3881 if (s == NULL || s->size() == 0) {
3882 errln("Character Class #%d is null or of zero size.", i);
3883 return;
3884 }
3885 }
3886
3887 while (loopCount < numIterations || numIterations == -1) {
3888 if (numIterations == -1 && loopCount % 10 == 0) {
3889 // If test is running in an infinite loop, display a periodic tic so
3890 // we can tell that it is making progress.
3891 fprintf(stderr, ".");
3892 }
3893 // Save current random number seed, so that we can recreate the random numbers
3894 // for this loop iteration in event of an error.
3895 seed = m_seed;
3896
3897 // Populate a test string with data.
3898 testText.truncate(0);
3899 for (i=0; i<TESTSTRINGLEN; i++) {
3900 int32_t aClassNum = m_rand() % numCharClasses;
3901 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3902 int32_t charIdx = m_rand() % classSet->size();
3903 UChar32 c = classSet->charAt(charIdx);
3904 if (c < 0) { // TODO: deal with sets containing strings.
2ca993e8 3905 errln("%s:%d c < 0", __FILE__, __LINE__);
73c04bcf
A
3906 break;
3907 }
2ca993e8
A
3908 // Do not assemble a supplementary character from randomly generated separate surrogates.
3909 // (It could be a dictionary character)
3910 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
3911 continue;
3912 }
3913
73c04bcf
A
3914 testText.append(c);
3915 }
3916
3917 // Calculate the expected results for this test string.
3918 mk.setText(testText);
3919 memset(expectedBreaks, 0, sizeof(expectedBreaks));
3920 expectedBreaks[0] = 1;
3921 int32_t breakPos = 0;
3922 expectedCount = 0;
3923 for (;;) {
3924 breakPos = mk.next(breakPos);
3925 if (breakPos == -1) {
3926 break;
3927 }
3928 if (breakPos > testText.length()) {
3929 errln("breakPos > testText.length()");
3930 }
3931 expectedBreaks[breakPos] = 1;
3932 U_ASSERT(expectedCount<testText.length());
3933 expected[expectedCount ++] = breakPos;
57a6839d
A
3934 (void)expected; // Set but not used warning.
3935 // TODO (andy): check it out.
73c04bcf
A
3936 }
3937
3938 // Find the break positions using forward iteration
3939 memset(forwardBreaks, 0, sizeof(forwardBreaks));
3940 if (useUText) {
3941 UErrorCode status = U_ZERO_ERROR;
3942 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3943 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3944 bi->setText(testUText, status);
3945 TEST_ASSERT_SUCCESS(status);
3946 utext_close(testUText); // The break iterator does a shallow clone of the UText
3947 // This UText can be closed immediately, so long as the
3948 // testText string continues to exist.
3949 } else {
3950 bi->setText(testText);
3951 }
3952
3953 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3954 if (i < 0 || i > testText.length()) {
3955 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3956 break;
3957 }
3958 forwardBreaks[i] = 1;
3959 }
3960
3961 // Find the break positions using reverse iteration
3962 memset(reverseBreaks, 0, sizeof(reverseBreaks));
3963 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3964 if (i < 0 || i > testText.length()) {
3965 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3966 break;
3967 }
3968 reverseBreaks[i] = 1;
3969 }
3970
3971 // Find the break positions using isBoundary() tests.
3972 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3973 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3974 for (i=0; i<=testText.length(); i++) {
3975 isBoundaryBreaks[i] = bi->isBoundary(i);
3976 }
3977
3978
3979 // Find the break positions using the following() function.
3980 // printf(".");
3981 memset(followingBreaks, 0, sizeof(followingBreaks));
3982 int32_t lastBreakPos = 0;
3983 followingBreaks[0] = 1;
3984 for (i=0; i<testText.length(); i++) {
3985 breakPos = bi->following(i);
3986 if (breakPos <= i ||
3987 breakPos < lastBreakPos ||
3988 breakPos > testText.length() ||
729e4ab9 3989 (breakPos > lastBreakPos && lastBreakPos > i)) {
73c04bcf
A
3990 errln("%s break monkey test: "
3991 "Out of range value returned by BreakIterator::following().\n"
3992 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
3993 name, seed, i, breakPos, lastBreakPos);
3994 break;
3995 }
3996 followingBreaks[breakPos] = 1;
3997 lastBreakPos = breakPos;
3998 }
3999
4000 // Find the break positions using the preceding() function.
46f4442e 4001 memset(precedingBreaks, 0, sizeof(precedingBreaks));
73c04bcf
A
4002 lastBreakPos = testText.length();
4003 precedingBreaks[testText.length()] = 1;
4004 for (i=testText.length(); i>0; i--) {
4005 breakPos = bi->preceding(i);
4006 if (breakPos >= i ||
4007 breakPos > lastBreakPos ||
729e4ab9
A
4008 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4009 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
73c04bcf
A
4010 errln("%s break monkey test: "
4011 "Out of range value returned by BreakIterator::preceding().\n"
4012 "index=%d; prev returned %d; lastBreak=%d" ,
4013 name, i, breakPos, lastBreakPos);
46f4442e
A
4014 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4015 precedingBreaks[i] = 2; // Forces an error.
4016 }
73c04bcf 4017 } else {
46f4442e
A
4018 if (breakPos >= 0) {
4019 precedingBreaks[breakPos] = 1;
2ca993e8 4020 }
73c04bcf
A
4021 lastBreakPos = breakPos;
4022 }
4023 }
4024
4025 // Compare the expected and actual results.
4026 for (i=0; i<=testText.length(); i++) {
4027 const char *errorType = NULL;
4028 if (forwardBreaks[i] != expectedBreaks[i]) {
4029 errorType = "next()";
4030 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4031 errorType = "previous()";
4032 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4033 errorType = "isBoundary()";
4034 } else if (followingBreaks[i] != expectedBreaks[i]) {
4035 errorType = "following()";
4036 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4037 errorType = "preceding()";
4038 }
4039
4040
4041 if (errorType != NULL) {
4042 // Format a range of the test text that includes the failure as
4043 // a data item that can be included in the rbbi test data file.
4044
4045 // Start of the range is the last point where expected and actual results
4046 // both agreed that there was a break position.
4047 int startContext = i;
4048 int32_t count = 0;
4049 for (;;) {
4050 if (startContext==0) { break; }
4051 startContext --;
4052 if (expectedBreaks[startContext] != 0) {
4053 if (count == 2) break;
4054 count ++;
4055 }
4056 }
4057
4058 // End of range is two expected breaks past the start position.
4059 int endContext = i + 1;
4060 int ci;
4061 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4062 for (;;) {
4063 if (endContext >= testText.length()) {break;}
4064 if (expectedBreaks[endContext-1] != 0) {
4065 if (count == 0) break;
4066 count --;
4067 }
4068 endContext ++;
4069 }
4070 }
4071
4072 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4073 UnicodeString errorText = "<data>";
4074 /***if (strcmp(errorType, "next()") == 0) {
4075 startContext = 0;
4076 endContext = testText.length();
4077
4078 printStringBreaks(testText, expected, expectedCount);
4079 }***/
4080
4081 for (ci=startContext; ci<endContext;) {
4082 UnicodeString hexChars("0123456789abcdef");
4083 UChar32 c;
4084 int bn;
4085 c = testText.char32At(ci);
4086 if (ci == i) {
4087 // This is the location of the error.
4088 errorText.append("<?>");
4089 } else if (expectedBreaks[ci] != 0) {
4090 // This a non-error expected break position.
4091 errorText.append("\\");
4092 }
4093 if (c < 0x10000) {
4094 errorText.append("\\u");
4095 for (bn=12; bn>=0; bn-=4) {
4096 errorText.append(hexChars.charAt((c>>bn)&0xf));
4097 }
4098 } else {
4099 errorText.append("\\U");
4100 for (bn=28; bn>=0; bn-=4) {
4101 errorText.append(hexChars.charAt((c>>bn)&0xf));
4102 }
4103 }
4104 ci = testText.moveIndex32(ci, 1);
4105 }
4106 errorText.append("\\");
4107 errorText.append("</data>\n");
4108
4109 // Output the error
4110 char charErrorTxt[500];
4111 UErrorCode status = U_ZERO_ERROR;
4112 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4113 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4388f060 4114 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
2ca993e8 4115
4388f060
A
4116 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4117 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
73c04bcf
A
4118 errorType, seed, i, charErrorTxt);
4119 break;
4120 }
4121 }
4122
4123 loopCount++;
4124 }
4125#endif
4126}
4127
729e4ab9
A
4128
4129// Bug 5532. UTF-8 based UText fails in dictionary code.
4130// This test checks the initial patch,
4131// which is to just keep it from crashing. Correct word boundaries
4132// await a proper fix to the dictionary code.
4133//
4134void RBBITest::TestBug5532(void) {
4135 // Text includes a mixture of Thai and Latin.
4136 const unsigned char utf8Data[] = {
4137 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
2ca993e8 4138 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
729e4ab9
A
4139 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4140 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4141 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
2ca993e8
A
4142 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4143 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4144 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4145 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4146 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
729e4ab9
A
4147 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4148
4149 UErrorCode status = U_ZERO_ERROR;
4150 UText utext=UTEXT_INITIALIZER;
4151 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4152 TEST_ASSERT_SUCCESS(status);
4153
4154 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4155 TEST_ASSERT_SUCCESS(status);
4156 if (U_SUCCESS(status)) {
4157 bi->setText(&utext, status);
4158 TEST_ASSERT_SUCCESS(status);
4159
4160 int32_t breakCount = 0;
4161 int32_t previousBreak = -1;
4162 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4163 // For now, just make sure that the break iterator doesn't hang.
4164 TEST_ASSERT(previousBreak < bi->current());
4165 previousBreak = bi->current();
4166 }
4167 TEST_ASSERT(breakCount > 0);
4168 }
4169 delete bi;
4170 utext_close(&utext);
4171}
4172
4173
51004dcb
A
4174void RBBITest::TestBug9983(void) {
4175 UnicodeString text = UnicodeString("\\u002A" // * Other
4176 "\\uFF65" // Other
4177 "\\u309C" // Katakana
4178 "\\uFF9F" // Extend
4179 "\\uFF65" // Other
4180 "\\u0020" // Other
4181 "\\u0000").unescape();
4182
4183 UErrorCode status = U_ZERO_ERROR;
4184 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4185 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4186 TEST_ASSERT_SUCCESS(status);
57a6839d
A
4187 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4188 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4189 TEST_ASSERT_SUCCESS(status);
51004dcb
A
4190 if (U_FAILURE(status)) {
4191 return;
4192 }
57a6839d
A
4193 int32_t offset, rstatus, iterationCount;
4194
51004dcb 4195 brkiter->setText(text);
51004dcb 4196 brkiter->last();
57a6839d 4197 iterationCount = 0;
51004dcb
A
4198 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4199 iterationCount++;
4200 rstatus = brkiter->getRuleStatus();
57a6839d
A
4201 (void)rstatus; // Suppress set but not used warning.
4202 if (iterationCount >= 10) {
2ca993e8 4203 break;
57a6839d
A
4204 }
4205 }
4206 TEST_ASSERT(iterationCount == 6);
4207
4208 brkiterPOSIX->setText(text);
4209 brkiterPOSIX->last();
4210 iterationCount = 0;
4211 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4212 iterationCount++;
4213 rstatus = brkiterPOSIX->getRuleStatus();
4214 (void)rstatus; // Suppress set but not used warning.
51004dcb 4215 if (iterationCount >= 10) {
2ca993e8 4216 break;
51004dcb
A
4217 }
4218 }
4219 TEST_ASSERT(iterationCount == 6);
4220}
4221
f3c0d7a5
A
4222// Bug 7547 - verify that building a break itereator from empty rules produces an error.
4223//
4224void RBBITest::TestBug7547() {
4225 UnicodeString rules;
4226 UErrorCode status = U_ZERO_ERROR;
4227 UParseError parseError;
4228 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4229 if (status != U_BRK_RULE_SYNTAX) {
4230 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4231 }
4232 if (parseError.line != 1 || parseError.offset != 0) {
4233 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4234 }
4235}
4236
4237
4238void RBBITest::TestBug12797() {
4239 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4240 UErrorCode status = U_ZERO_ERROR;
4241 UParseError parseError;
4242 RuleBasedBreakIterator bi(rules, parseError, status);
4243 if (U_FAILURE(status)) {
4244 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4245 return;
4246 }
4247 UnicodeString text = "abc";
4248 bi.setText(text);
4249 bi.first();
4250 int32_t boundary = bi.next();
4251 if (boundary != 3) {
4252 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4253 }
4254}
4255
4256void RBBITest::TestBug12918() {
4257 // This test triggers an assertion failure in dictbe.cpp
4258 const UChar *crasherString = u"\u3325\u4a16";
4259 UErrorCode status = U_ZERO_ERROR;
4260 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4261 if (U_FAILURE(status)) {
4262 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4263 return;
4264 }
4265 ubrk_first(iter);
4266 int32_t pos = 0;
4267 int32_t lastPos = -1;
4268 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4269 if (pos <= lastPos) {
4270 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4271 break;
4272 }
4273 }
4274 ubrk_close(iter);
4275}
4276
4277void RBBITest::TestBug12932() {
4278 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4279 UnicodeString ruleStr(
4280 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4281 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4282 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4283 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4284 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4285 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4286
4287 UErrorCode status = U_ZERO_ERROR;
4288 UParseError parseError;
4289 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4290 if (status != U_BRK_RULE_SYNTAX) {
4291 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4292 __FILE__, __LINE__, u_errorName(status));
4293 }
4294}
4295
4296
4297// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4298// remain undevided by ICU char, word and line break.
4299void RBBITest::TestEmoji() {
0f5d89e8 4300#if !UCONFIG_NO_REGULAR_EXPRESSIONS
f3c0d7a5
A
4301 UErrorCode status = U_ZERO_ERROR;
4302
4303 CharString testFileName;
4304 testFileName.append(IntlTest::getSourceTestData(status), status);
4305 testFileName.appendPathPart("emoji-test.txt", status);
4306 if (U_FAILURE(status)) {
4307 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4308 return;
4309 }
4310 logln("Opening data file %s\n", testFileName.data());
4311
4312 int len;
4313 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4314 if (U_FAILURE(status) || testFile == NULL) {
4315 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4316 return;
4317 }
4318 UnicodeString testFileAsString(testFile, len);
4319 delete [] testFile;
4320
4321 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4322 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4323 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4324 int32_t lineNumber = 0;
4325
4326 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4327 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4328 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4329 if (U_FAILURE(status)) {
4330 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4331 return;
4332 }
4333
4334 while (lineMatcher.find()) {
4335 ++lineNumber;
4336 UnicodeString line = lineMatcher.group(status);
4337 hexMatcher.reset(line);
4338 UnicodeString testString; // accumulates the emoji sequence.
4339 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4340 UnicodeString hex = hexMatcher.group(1, status);
4341 if (hex.length() > 8) {
4342 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4343 break;
4344 }
4345 CharString hex8;
4346 hex8.appendInvariantChars(hex, status);
4347 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4348 if (c<=0x10ffff) {
4349 testString.append(c);
4350 } else {
4351 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4352 __FILE__, __LINE__, lineNumber, hex8.data());
4353 break;
4354 }
4355 }
4356
4357 if (testString.length() > 1) {
4358 charBreaks->setText(testString);
4359 charBreaks->first();
4360 int32_t firstBreak = charBreaks->next();
4361 if (testString.length() != firstBreak) {
4362 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4363 __FILE__, __LINE__, lineNumber, firstBreak);
4364 }
4365 wordBreaks->setText(testString);
4366 wordBreaks->first();
4367 firstBreak = wordBreaks->next();
4368 if (testString.length() != firstBreak) {
4369 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4370 __FILE__, __LINE__, lineNumber, firstBreak);
4371 }
4372 lineBreaks->setText(testString);
4373 lineBreaks->first();
4374 firstBreak = lineBreaks->next();
4375 if (testString.length() != firstBreak) {
4376 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4377 __FILE__, __LINE__, lineNumber, firstBreak);
4378 }
4379 }
4380 }
0f5d89e8
A
4381#endif
4382}
4383
4384
4385// TestBug12519 - Correct handling of Locales by assignment / copy / clone
4386
0f5d89e8
A
4387void RBBITest::TestBug12519() {
4388 UErrorCode status = U_ZERO_ERROR;
4389 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4390 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4391 if (!assertSuccess(WHERE, status)) {
4392 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4393 return;
4394 }
4395 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4396
4397 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4398 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4399
4400 LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
4401 assertTrue(WHERE, *biEn == *cloneEn);
4402 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4403
4404 LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
4405 assertTrue(WHERE, *biFr == *cloneFr);
4406 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4407
4408 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4409 UnicodeString text("Hallo Welt");
4410 biDe->setText(text);
4411 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4412 *biDe = *biFr;
4413 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4414}
4415
4416void RBBITest::TestBug12677() {
4417 // Check that stripping of comments from rules for getRules() is not confused by
4418 // the presence of '#' characters in the rules that do not introduce comments.
4419 UnicodeString rules(u"!!forward; \n"
4420 "$x = [ab#]; # a set with a # literal. \n"
4421 " # .; # a comment that looks sort of like a rule. \n"
4422 " '#' '?'; # a rule with a quoted # \n"
4423 );
4424
4425 UErrorCode status = U_ZERO_ERROR;
4426 UParseError pe;
4427 RuleBasedBreakIterator bi(rules, pe, status);
4428 assertSuccess(WHERE, status);
4429 UnicodeString rtRules = bi.getRules();
4430 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
4431}
4432
4433
4434void RBBITest::TestTableRedundancies() {
4435 UErrorCode status = U_ZERO_ERROR;
4436
4437 LocalPointer<RuleBasedBreakIterator> bi (
4438 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4439 assertSuccess(WHERE, status);
4440 if (U_FAILURE(status)) return;
4441
4442 RBBIDataWrapper *dw = bi->fData;
4443 const RBBIStateTable *fwtbl = dw->fForwardTable;
4444 int32_t numCharClasses = dw->fHeader->fCatCount;
4445 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4446
4447 // Check for duplicate columns (character categories)
4448
4449 std::vector<UnicodeString> columns;
4450 for (int32_t column = 0; column < numCharClasses; column++) {
4451 UnicodeString s;
4452 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4453 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4454 s.append(row->fNextState[column]);
4455 }
4456 columns.push_back(s);
4457 }
4458 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4459 for (int c1=1; c1<numCharClasses; c1++) {
4460 for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4461 if (columns.at(c1) == columns.at(c2)) {
4462 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4463 goto out;
4464 }
4465 }
4466 }
4467 out:
4468
4469 // Check for duplicate states
4470 std::vector<UnicodeString> rows;
4471 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4472 UnicodeString s;
4473 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4474 assertTrue(WHERE, row->fAccepting >= -1);
4475 s.append(row->fAccepting + 1); // values of -1 are expected.
4476 s.append(row->fLookAhead);
4477 s.append(row->fTagIdx);
4478 for (int32_t column = 0; column < numCharClasses; column++) {
4479 s.append(row->fNextState[column]);
4480 }
4481 rows.push_back(s);
4482 }
4483 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4484 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4485 if (rows.at(r1) == rows.at(r2)) {
4486 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4487 return;
4488 }
4489 }
4490 }
4491}
4492
4493// Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4494// even after next() has returned DONE.
4495
4496void RBBITest::TestBug13447() {
4497 UErrorCode status = U_ZERO_ERROR;
4498 LocalPointer<RuleBasedBreakIterator> bi(
4499 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4500 assertSuccess(WHERE, status);
4501 if (U_FAILURE(status)) return;
4502 UnicodeString data(u"1234");
4503 bi->setText(data);
4504 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4505 assertEquals(WHERE, 4, bi->next());
4506 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4507 assertEquals(WHERE, UBRK_DONE, bi->next());
4508 assertEquals(WHERE, 4, bi->current());
4509 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4510}
4511
4512// TestReverse exercises both the synthesized safe reverse rules and the logic
4513// for filling the break iterator cache when starting from random positions
4514// in the text.
4515//
4516// It's a monkey test, working on random data, with the expected data obtained
4517// from forward iteration (no safe rules involved), comparing with results
4518// when indexing into the interior of the string (safe rules needed).
4519
4520void RBBITest::TestReverse() {
4521 UErrorCode status = U_ZERO_ERROR;
4522
4523 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4524 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4525 assertSuccess(WHERE, status, true);
4526 status = U_ZERO_ERROR;
4527 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4528 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4529 assertSuccess(WHERE, status, true);
4530 status = U_ZERO_ERROR;
4531 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4532 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4533 assertSuccess(WHERE, status, true);
4534 status = U_ZERO_ERROR;
4535 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4536 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4537 assertSuccess(WHERE, status, true);
4538}
4539
4540void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4541 if (!bi) {
4542 return;
4543 }
4544
4545 // From the mapping trie in the break iterator's internal data, create a
4546 // vector of UnicodeStrings, one for each character category, containing
4547 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4548 // to avoid an execess of unassigned code points.
4549
4550 RBBIDataWrapper *data = bi->fData;
4551 int32_t categoryCount = data->fHeader->fCatCount;
4552 UTrie2 *trie = data->fTrie;
4553
4554 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4555 for (int cp=0; cp<0x1fff0; ++cp) {
4556 int cat = utrie2_get32(trie, cp);
4557 cat &= ~0x4000; // And off the dictionary bit from the category.
4558 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4559 if (cat < 0 || cat >= categoryCount) return;
4560 strings[cat].append(cp);
4561 }
4562
4563 icu_rand randomGen;
4564 const int testStringLength = 10000;
4565 UnicodeString testString;
4566
4567 for (int i=0; i<testStringLength; ++i) {
4568 int charClass = randomGen() % categoryCount;
4569 if (strings[charClass].length() > 0) {
4570 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4571 testString.append(cp);
4572 }
4573 }
4574
4575 typedef std::pair<UBool, int32_t> Result;
4576 std::vector<Result> expectedResults;
4577 bi->setText(testString);
4578 for (int i=0; i<testString.length(); ++i) {
4579 bool isboundary = bi->isBoundary(i);
4580 int ruleStatus = bi->getRuleStatus();
4581 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4582 }
4583
4584 for (int i=testString.length()-1; i>=0; --i) {
4585 bi->setText(testString); // clears the internal break cache
4586 Result expected = expectedResults[i];
4587 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4588 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4589 }
f3c0d7a5
A
4590}
4591
51004dcb 4592
0f5d89e8
A
4593// Ticket 13692 - finding word boundaries in very large numbers or words could
4594// be very time consuming. When the problem was present, this test
4595// would run more than fifteen minutes, which is to say, the failure was noticeable.
4596
4597void RBBITest::TestBug13692() {
4598 UErrorCode status = U_ZERO_ERROR;
4599 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4600 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4601 if (!assertSuccess(WHERE, status, true)) {
4602 return;
4603 }
4604 constexpr int32_t LENGTH = 1000000;
4605 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4606 for (int i=0; i<20; i+=2) {
4607 longNumber.setCharAt(i, u' ');
4608 }
4609 bi->setText(longNumber);
4610 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4611 assertSuccess(WHERE, status);
4612}
4613
73c04bcf
A
4614//
4615// TestDebug - A place-holder test for debugging purposes.
4616// For putting in fragments of other tests that can be invoked
4617// for tracing without a lot of unwanted extra stuff happening.
4618//
4619void RBBITest::TestDebug(void) {
0f5d89e8
A
4620 UErrorCode status = U_ZERO_ERROR;
4621 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4622 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4623 if (!assertSuccess(WHERE, status, true)) {
4624 return;
4625 }
4626 const UnicodeString &rules = bi->getRules();
4627 UParseError pe;
4628 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4629 assertSuccess(WHERE, status);
73c04bcf
A
4630}
4631
4388f060
A
4632void RBBITest::TestProperties() {
4633 UErrorCode errorCode = U_ZERO_ERROR;
4634 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4635 if (!prependSet.isEmpty()) {
4636 errln(
4637 "[:GCB=Prepend:] is not empty any more. "
4638 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4639 "change this test to the opposite condition.");
4640 }
4641}
4642
0f5d89e8 4643#endif // #if !UCONFIG_NO_BREAK_ITERATION