]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/********************************************************************
4 * COPYRIGHT:
2ca993e8 5 * Copyright (c) 1999-2016, International Business Machines Corporation and
73c04bcf
A
6 * others. All Rights Reserved.
7 ********************************************************************/
8/************************************************************************
9* Date Name Description
10* 12/15/99 Madhu Creation.
11* 01/12/2000 Madhu Updated for changed API and added new tests
12************************************************************************/
13
14#include "unicode/utypes.h"
73c04bcf
A
15#if !UCONFIG_NO_BREAK_ITERATION
16
2ca993e8
A
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20
73c04bcf 21#include "unicode/brkiter.h"
2ca993e8
A
22#include "unicode/localpointer.h"
23#include "unicode/numfmt.h"
73c04bcf 24#include "unicode/rbbi.h"
2ca993e8
A
25#if !UCONFIG_NO_REGULAR_EXPRESSIONS
26#include "unicode/regex.h"
27#endif
28#include "unicode/schriter.h"
73c04bcf
A
29#include "unicode/uchar.h"
30#include "unicode/utf16.h"
31#include "unicode/ucnv.h"
73c04bcf 32#include "unicode/uniset.h"
2ca993e8 33#include "unicode/uscript.h"
73c04bcf
A
34#include "unicode/ustring.h"
35#include "unicode/utext.h"
2ca993e8
A
36
37#include "charstr.h"
38#include "cmemory.h"
f3c0d7a5 39#include "cstr.h"
73c04bcf
A
40#include "intltest.h"
41#include "rbbitst.h"
2ca993e8 42#include "utypeinfo.h" // for 'typeid' to work
73c04bcf
A
43#include "uvector.h"
44#include "uvectr32.h"
2ca993e8
A
45
46#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
47#include "unicode/filteredbrk.h"
48#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
73c04bcf
A
49
50#define TEST_ASSERT(x) {if (!(x)) { \
51 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
52
46f4442e 53#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
729e4ab9 54 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
73c04bcf
A
55
56
46f4442e
A
57//---------------------------------------------
58// runIndexedTest
59//---------------------------------------------
60
4388f060 61
2ca993e8 62// Note: Before adding new tests to this file, check whether the desired test data can
4388f060
A
63// simply be added to the file testdata/rbbitest.txt. In most cases it can,
64// it's much less work than writing a new test, diagnostic output in the event of failures
65// is good, and the test data file is shared with ICU4J, so eventually the test
66// will run there as well, without additional effort.
67
46f4442e
A
68void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
69{
70 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
f3c0d7a5 71 fTestParams = params;
46f4442e 72
f3c0d7a5 73 TESTCASE_AUTO_BEGIN;
729e4ab9 74#if !UCONFIG_NO_FILE_IO
f3c0d7a5 75 TESTCASE_AUTO(TestBug4153072);
729e4ab9 76#endif
f3c0d7a5 77 TESTCASE_AUTO(TestStatusReturn);
729e4ab9 78#if !UCONFIG_NO_FILE_IO
f3c0d7a5
A
79 TESTCASE_AUTO(TestUnicodeFiles);
80 TESTCASE_AUTO(TestEmptyString);
729e4ab9 81#endif
f3c0d7a5
A
82 TESTCASE_AUTO(TestGetAvailableLocales);
83 TESTCASE_AUTO(TestGetDisplayName);
729e4ab9 84#if !UCONFIG_NO_FILE_IO
f3c0d7a5
A
85 TESTCASE_AUTO(TestEndBehaviour);
86 TESTCASE_AUTO(TestWordBreaks);
87 TESTCASE_AUTO(TestWordBoundary);
88 TESTCASE_AUTO(TestLineBreaks);
89 TESTCASE_AUTO(TestSentBreaks);
90 TESTCASE_AUTO(TestExtended);
729e4ab9 91#endif
4388f060 92#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
f3c0d7a5 93 TESTCASE_AUTO(TestMonkey);
4388f060 94#endif
729e4ab9 95#if !UCONFIG_NO_FILE_IO
f3c0d7a5 96 TESTCASE_AUTO(TestBug3818);
729e4ab9 97#endif
f3c0d7a5 98 TESTCASE_AUTO(TestDebug);
729e4ab9 99#if !UCONFIG_NO_FILE_IO
f3c0d7a5 100 TESTCASE_AUTO(TestBug5775);
729e4ab9 101#endif
f3c0d7a5
A
102 TESTCASE_AUTO(TestBug9983);
103 TESTCASE_AUTO(TestDictRules);
104 TESTCASE_AUTO(TestBug5532);
105 TESTCASE_AUTO(TestBug7547);
106 TESTCASE_AUTO(TestBug12797);
107 TESTCASE_AUTO(TestBug12918);
108 TESTCASE_AUTO(TestBug12932);
109 TESTCASE_AUTO(TestEmoji);
110 TESTCASE_AUTO_END;
46f4442e
A
111}
112
113
73c04bcf
A
114//---------------------------------------------------------------------------
115//
116// class BITestData Holds a set of Break iterator test data and results
117// Includes
118// - the string data to be broken
119// - a vector of the expected break positions.
120// - a vector of source line numbers for the data,
121// (to help see where errors occurred.)
122// - The expected break tag values.
123// - Vectors of actual break positions and tag values.
124// - Functions for comparing actual with expected and
125// reporting errors.
126//
127//----------------------------------------------------------------------------
128class BITestData {
129public:
130 UnicodeString fDataToBreak;
131 UVector fExpectedBreakPositions;
132 UVector fExpectedTags;
133 UVector fLineNum;
134 UVector fActualBreakPositions; // Test Results.
135 UVector fActualTags;
136
137 BITestData(UErrorCode &status);
138 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
139 void checkResults(const char *heading, RBBITest *test);
140 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
141 void clearResults();
142};
143
144//
145// Constructor.
146//
147BITestData::BITestData(UErrorCode &status)
148: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
149 fActualTags(status)
150{
151}
152
153//
154// addDataChunk. Add a section (non-breaking) piece of data to the test data.
155// The macro form collects the line number, which is helpful
156// when tracking down failures.
157//
158// A null data item is inserted at the start of each test's data
159// to put the starting zero into the data list. The position saved for
160// each non-null item is its ending position.
161//
162#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
163void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
164 if (U_FAILURE(status)) {return;}
165 if (data != NULL) {
166 fDataToBreak.append(CharsToUnicodeString(data));
167 }
168 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
169 fExpectedTags.addElement(tag, status);
170 fLineNum.addElement(lineNum, status);
171}
172
173
174//
175// checkResults. Compare the actual and expected break positions, report any differences.
176//
177void BITestData::checkResults(const char *heading, RBBITest *test) {
178 int32_t expectedIndex = 0;
179 int32_t actualIndex = 0;
180
181 for (;;) {
182 // If we've run through both the expected and actual results vectors, we're done.
183 // break out of the loop.
184 if (expectedIndex >= fExpectedBreakPositions.size() &&
185 actualIndex >= fActualBreakPositions.size()) {
186 break;
187 }
188
189
190 if (expectedIndex >= fExpectedBreakPositions.size()) {
191 err(heading, test, expectedIndex-1, actualIndex);
192 actualIndex++;
193 continue;
194 }
195
196 if (actualIndex >= fActualBreakPositions.size()) {
197 err(heading, test, expectedIndex, actualIndex-1);
198 expectedIndex++;
199 continue;
200 }
201
202 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
203 err(heading, test, expectedIndex, actualIndex);
204 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
205 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
206 actualIndex++;
207 } else {
208 expectedIndex++;
209 }
210 continue;
211 }
212
213 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
214 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
215 heading, fLineNum.elementAt(expectedIndex),
216 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
217 }
218
219 actualIndex++;
220 expectedIndex++;
221 }
222}
223
224//
225// err - An error was found. Report it, along with information about where the
226// incorrectly broken test data appeared in the source file.
227//
228void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
229{
230 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
231 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
232 int32_t o = 0;
233 int32_t line = fLineNum.elementAti(expectedIdx);
234 if (expectedIdx > 0) {
235 // The line numbers are off by one because a premature break occurs somewhere
236 // within the previous item, rather than at the start of the current (expected) item.
237 // We want to report the offset of the unexpected break from the start of
238 // this previous item.
239 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
240 }
241 if (actual < expected) {
46f4442e 242 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
73c04bcf 243 } else {
46f4442e 244 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
73c04bcf
A
245 }
246}
247
248
249void BITestData::clearResults() {
250 fActualBreakPositions.removeAllElements();
251 fActualTags.removeAllElements();
252}
253
254
73c04bcf
A
255//--------------------------------------------------------------------------------------
256//
257// RBBITest constructor and destructor
258//
259//--------------------------------------------------------------------------------------
260
261RBBITest::RBBITest() {
f3c0d7a5 262 fTestParams = NULL;
73c04bcf
A
263}
264
265
266RBBITest::~RBBITest() {
73c04bcf
A
267}
268
73c04bcf
A
269//-----------------------------------------------------------------------------------
270//
271// Test for status {tag} return value from break rules.
272// TODO: a more thorough test.
273//
274//-----------------------------------------------------------------------------------
275void RBBITest::TestStatusReturn() {
46f4442e 276 UnicodeString rulesString1("$Letters = [:L:];\n"
73c04bcf
A
277 "$Numbers = [:N:];\n"
278 "$Letters+{1};\n"
279 "$Numbers+{2};\n"
2ca993e8 280 "Help\\ /me\\!{4};\n"
73c04bcf 281 "[^$Letters $Numbers];\n"
46f4442e 282 "!.*;\n", -1, US_INV);
73c04bcf
A
283 UnicodeString testString1 = "abc123..abc Help me Help me!";
284 // 01234567890123456789012345678
285 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
286 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
287
288 UErrorCode status=U_ZERO_ERROR;
289 UParseError parseError;
290
2ca993e8 291 LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
73c04bcf 292 if(U_FAILURE(status)) {
2ca993e8
A
293 dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
294 return;
295 }
296 int32_t pos;
297 int32_t i = 0;
298 bi->setText(testString1);
299 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
300 if (pos != bounds1[i]) {
301 errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
302 break;
303 }
73c04bcf 304
2ca993e8
A
305 int tag = bi->getRuleStatus();
306 if (tag != brkStatus[i]) {
307 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
308 break;
73c04bcf 309 }
2ca993e8 310 i++;
73c04bcf 311 }
73c04bcf
A
312}
313
314
b331163b 315static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
73c04bcf
A
316 UErrorCode status = U_ZERO_ERROR;
317 char name[100];
318 printf("code alpha extend alphanum type word sent line name\n");
b331163b
A
319 int nextExpectedIndex = 0;
320 utext_setNativeIndex(tstr, 0);
321 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
322 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
323 printf("------------------------------------------------ %d\n", j);
324 ++nextExpectedIndex;
73c04bcf 325 }
b331163b
A
326
327 UChar32 c = utext_next32(tstr);
73c04bcf
A
328 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
329 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
330 u_isUAlphabetic(c),
331 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
332 u_isalnum(c),
333 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
334 u_charType(c),
335 U_SHORT_PROPERTY_NAME),
336 u_getPropertyValueName(UCHAR_WORD_BREAK,
337 u_getIntPropertyValue(c,
338 UCHAR_WORD_BREAK),
339 U_SHORT_PROPERTY_NAME),
340 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
341 u_getIntPropertyValue(c,
342 UCHAR_SENTENCE_BREAK),
343 U_SHORT_PROPERTY_NAME),
344 u_getPropertyValueName(UCHAR_LINE_BREAK,
345 u_getIntPropertyValue(c,
346 UCHAR_LINE_BREAK),
347 U_SHORT_PROPERTY_NAME),
348 name);
349 }
350}
351
73c04bcf 352
b331163b
A
353static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
354 UErrorCode status = U_ZERO_ERROR;
355 UText *tstr = NULL;
356 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
357 if (U_FAILURE(status)) {
358 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
359 return;
360 }
361 printStringBreaks(tstr, expected, expectedCount);
362 utext_close(tstr);
363}
364
365
73c04bcf
A
366void RBBITest::TestBug3818() {
367 UErrorCode status = U_ZERO_ERROR;
368
369 // Four Thai words...
370 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
371 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
372 UnicodeString thaiStr(thaiWordData);
373
57a6839d 374 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
73c04bcf 375 if (U_FAILURE(status) || bi == NULL) {
729e4ab9 376 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
73c04bcf
A
377 return;
378 }
379 bi->setText(thaiStr);
380
381 int32_t startOfSecondWord = bi->following(1);
382 if (startOfSecondWord != 4) {
383 errln("Fail at file %s, line %d expected start of word at 4, got %d",
384 __FILE__, __LINE__, startOfSecondWord);
385 }
386 startOfSecondWord = bi->following(0);
387 if (startOfSecondWord != 4) {
388 errln("Fail at file %s, line %d expected start of word at 4, got %d",
389 __FILE__, __LINE__, startOfSecondWord);
390 }
391 delete bi;
392}
393
73c04bcf
A
394//----------------------------------------------------------------------------
395//
396// generalIteratorTest Given a break iterator and a set of test data,
397// Run the tests and report the results.
398//
399//----------------------------------------------------------------------------
400void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
401{
402
403 bi.setText(td.fDataToBreak);
404
405 testFirstAndNext(bi, td);
406
407 testLastAndPrevious(bi, td);
408
409 testFollowing(bi, td);
410 testPreceding(bi, td);
411 testIsBoundary(bi, td);
412 doMultipleSelectionTest(bi, td);
413}
414
415
416//
417// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
418// kind of loop.
419//
420void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
421{
422 UErrorCode status = U_ZERO_ERROR;
423 int32_t p;
424 int32_t lastP = -1;
425 int32_t tag;
426
427 logln("Test first and next");
428 bi.setText(td.fDataToBreak);
429 td.clearResults();
430
431 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
432 td.fActualBreakPositions.addElement(p, status); // Save result.
433 tag = bi.getRuleStatus();
434 td.fActualTags.addElement(tag, status);
435 if (p <= lastP) {
436 // If the iterator is not making forward progress, stop.
437 // No need to raise an error here, it'll be detected in the normal check of results.
438 break;
439 }
440 lastP = p;
441 }
442 td.checkResults("testFirstAndNext", this);
443}
444
445
446//
447// TestLastAndPrevious. Run the iterator backwards, starting with last().
448//
449void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
450{
451 UErrorCode status = U_ZERO_ERROR;
452 int32_t p;
453 int32_t lastP = 0x7ffffffe;
454 int32_t tag;
455
46f4442e 456 logln("Test last and previous");
73c04bcf
A
457 bi.setText(td.fDataToBreak);
458 td.clearResults();
459
460 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
461 // Save break position. Insert it at start of vector of results, shoving
462 // already-saved results further towards the end.
463 td.fActualBreakPositions.insertElementAt(p, 0, status);
464 // bi.previous(); // TODO: Why does this fix things up????
465 // bi.next();
466 tag = bi.getRuleStatus();
467 td.fActualTags.insertElementAt(tag, 0, status);
468 if (p >= lastP) {
469 // If the iterator is not making progress, stop.
470 // No need to raise an error here, it'll be detected in the normal check of results.
471 break;
472 }
473 lastP = p;
474 }
475 td.checkResults("testLastAndPrevious", this);
476}
477
478
479void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
480{
481 UErrorCode status = U_ZERO_ERROR;
482 int32_t p;
483 int32_t tag;
484 int32_t lastP = -2; // A value that will never be returned as a break position.
485 // cannot be -1; that is returned for DONE.
486 int i;
487
488 logln("testFollowing():");
489 bi.setText(td.fDataToBreak);
490 td.clearResults();
491
492 // Save the starting point, since we won't get that out of following.
493 p = bi.first();
494 td.fActualBreakPositions.addElement(p, status); // Save result.
495 tag = bi.getRuleStatus();
496 td.fActualTags.addElement(tag, status);
497
498 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
499 p = bi.following(i);
500 if (p != lastP) {
501 if (p == RuleBasedBreakIterator::DONE) {
502 break;
503 }
504 // We've reached a new break position. Save it.
505 td.fActualBreakPositions.addElement(p, status); // Save result.
506 tag = bi.getRuleStatus();
507 td.fActualTags.addElement(tag, status);
508 lastP = p;
509 }
510 }
511 // The loop normally exits by means of the break in the middle.
512 // Make sure that the index was at the correct position for the break iterator to have
513 // returned DONE.
514 if (i != td.fDataToBreak.length()) {
515 errln("testFollowing(): iterator returned DONE prematurely.");
516 }
517
518 // Full check of all results.
519 td.checkResults("testFollowing", this);
520}
521
522
523
524void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
525 UErrorCode status = U_ZERO_ERROR;
526 int32_t p;
527 int32_t tag;
528 int32_t lastP = 0x7ffffffe;
529 int i;
530
531 logln("testPreceding():");
532 bi.setText(td.fDataToBreak);
533 td.clearResults();
534
535 p = bi.last();
536 td.fActualBreakPositions.addElement(p, status);
537 tag = bi.getRuleStatus();
538 td.fActualTags.addElement(tag, status);
539
540 for (i = td.fDataToBreak.length(); i>=-1; i--) {
541 p = bi.preceding(i);
542 if (p != lastP) {
543 if (p == RuleBasedBreakIterator::DONE) {
544 break;
545 }
546 // We've reached a new break position. Save it.
547 td.fActualBreakPositions.insertElementAt(p, 0, status);
548 lastP = p;
549 tag = bi.getRuleStatus();
550 td.fActualTags.insertElementAt(tag, 0, status);
551 }
552 }
553 // The loop normally exits by means of the break in the middle.
554 // Make sure that the index was at the correct position for the break iterator to have
555 // returned DONE.
556 if (i != 0) {
557 errln("testPreceding(): iterator returned DONE prematurely.");
558 }
559
560 // Full check of all results.
561 td.checkResults("testPreceding", this);
562}
563
564
565
566void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
567 UErrorCode status = U_ZERO_ERROR;
568 int i;
569 int32_t tag;
570
571 logln("testIsBoundary():");
572 bi.setText(td.fDataToBreak);
573 td.clearResults();
574
575 for (i = 0; i <= td.fDataToBreak.length(); i++) {
576 if (bi.isBoundary(i)) {
577 td.fActualBreakPositions.addElement(i, status); // Save result.
578 tag = bi.getRuleStatus();
579 td.fActualTags.addElement(tag, status);
580 }
581 }
582 td.checkResults("testIsBoundary: ", this);
583}
584
585
586
587void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
588{
589 iterator.setText(td.fDataToBreak);
590
591 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
592 int32_t offset = iterator.first();
593 int32_t testOffset;
594 int32_t count = 0;
595
596 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
597
598 if (*testIterator != iterator)
599 errln("clone() or operator!= failed: two clones compared unequal");
600
601 do {
602 testOffset = testIterator->first();
603 testOffset = testIterator->next(count);
604 if (offset != testOffset)
605 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
606
607 if (offset != RuleBasedBreakIterator::DONE) {
608 count++;
609 offset = iterator.next();
610
611 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
612 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
613 if (count > 10000 || offset == -1) {
614 errln("operator== failed too many times. Stopping test.");
615 if (offset == -1) {
616 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
617 }
618 return;
619 }
620 }
621 }
622 } while (offset != RuleBasedBreakIterator::DONE);
623
624 // now do it backwards...
625 offset = iterator.last();
626 count = 0;
627
628 do {
629 testOffset = testIterator->last();
630 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
631 if (offset != testOffset)
632 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
633
634 if (offset != RuleBasedBreakIterator::DONE) {
635 count--;
636 offset = iterator.previous();
637 }
638 } while (offset != RuleBasedBreakIterator::DONE);
639
640 delete testIterator;
641}
642
643
644//---------------------------------------------
645//
646// other tests
647//
648//---------------------------------------------
649void RBBITest::TestEmptyString()
650{
651 UnicodeString text = "";
652 UErrorCode status = U_ZERO_ERROR;
653
654 BITestData x(status);
655 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
656 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
657 if (U_FAILURE(status))
658 {
729e4ab9 659 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
73c04bcf
A
660 return;
661 }
662 generalIteratorTest(*bi, x);
663 delete bi;
664}
665
666void RBBITest::TestGetAvailableLocales()
667{
668 int32_t locCount = 0;
669 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
670
671 if (locCount == 0)
729e4ab9 672 dataerrln("getAvailableLocales() returned an empty list!");
73c04bcf
A
673 // Just make sure that it's returning good memory.
674 int32_t i;
675 for (i = 0; i < locCount; ++i) {
676 logln(locList[i].getName());
677 }
678}
679
680//Testing the BreakIterator::getDisplayName() function
681void RBBITest::TestGetDisplayName()
682{
683 UnicodeString result;
684
685 BreakIterator::getDisplayName(Locale::getUS(), result);
686 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
729e4ab9 687 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
73c04bcf
A
688 + result);
689
690 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
691 if (result != "French (France)")
729e4ab9 692 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
73c04bcf
A
693 + result);
694}
695/**
696 * Test End Behaviour
697 * @bug 4068137
698 */
699void RBBITest::TestEndBehaviour()
700{
701 UErrorCode status = U_ZERO_ERROR;
702 UnicodeString testString("boo.");
703 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
704 if (U_FAILURE(status))
705 {
729e4ab9 706 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
73c04bcf
A
707 return;
708 }
709 wb->setText(testString);
710
711 if (wb->first() != 0)
712 errln("Didn't get break at beginning of string.");
713 if (wb->next() != 3)
714 errln("Didn't get break before period in \"boo.\"");
715 if (wb->current() != 4 && wb->next() != 4)
716 errln("Didn't get break at end of string.");
717 delete wb;
718}
719/*
720 * @bug 4153072
721 */
722void RBBITest::TestBug4153072() {
723 UErrorCode status = U_ZERO_ERROR;
724 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
725 if (U_FAILURE(status))
726 {
729e4ab9 727 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
73c04bcf
A
728 return;
729 }
730 UnicodeString str("...Hello, World!...");
731 int32_t begin = 3;
732 int32_t end = str.length() - 3;
733 UBool onBoundary;
734
735 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
736 iter->adoptText(textIterator);
737 int index;
738 // Note: with the switch to UText, there is no way to restrict the
739 // iteration range to begin at an index other than zero.
740 // String character iterators created with a non-zero bound are
741 // treated by RBBI as being empty.
742 for (index = -1; index < begin + 1; ++index) {
743 onBoundary = iter->isBoundary(index);
744 if (index == 0? !onBoundary : onBoundary) {
745 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
746 " and begin index = " + begin);
747 }
748 }
749 delete iter;
750}
751
752
46f4442e
A
753//
754// Test for problem reported by Ashok Matoria on 9 July 2007
755// One.<kSoftHyphen><kSpace>Two.
756//
757// Sentence break at start (0) and then on calling next() it breaks at
758// 'T' of "Two". Now, at this point if I do next() and
759// then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
760//
761void RBBITest::TestBug5775() {
762 UErrorCode status = U_ZERO_ERROR;
763 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
764 TEST_ASSERT_SUCCESS(status);
729e4ab9
A
765 if (U_FAILURE(status)) {
766 return;
767 }
768// Check for status first for better handling of no data errors.
46f4442e 769 TEST_ASSERT(bi != NULL);
729e4ab9 770 if (bi == NULL) {
46f4442e
A
771 return;
772 }
2ca993e8 773
46f4442e
A
774 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
775 // 01234 56789
776 s = s.unescape();
777 bi->setText(s);
778 int pos = bi->next();
779 TEST_ASSERT(pos == 6);
780 pos = bi->next();
781 TEST_ASSERT(pos == 10);
782 pos = bi->previous();
783 TEST_ASSERT(pos == 6);
784 delete bi;
785}
786
787
788
73c04bcf
A
789//------------------------------------------------------------------------------
790//
791// RBBITest::Extended Run RBBI Tests from an external test data file
792//
793//------------------------------------------------------------------------------
794
795struct TestParams {
b331163b
A
796 BreakIterator *bi; // Break iterator is set while parsing test source.
797 // Changed out whenever test data changes break type.
798
799 UnicodeString dataToBreak; // Data that is built up while parsing the test.
800 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
801 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
73c04bcf 802 UVector32 *srcCol;
b331163b
A
803
804 UText *textToBreak; // UText, could be UTF8 or UTF16.
805 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
806 CharString utf8String; // UTF-8 form of text to break.
807
808 TestParams(UErrorCode &status) : dataToBreak() {
809 bi = NULL;
810 expectedBreaks = new UVector32(status);
811 srcLine = new UVector32(status);
812 srcCol = new UVector32(status);
813 textToBreak = NULL;
814 textMap = new UVector32(status);
815 }
816
817 ~TestParams() {
818 delete bi;
819 delete expectedBreaks;
820 delete srcLine;
821 delete srcCol;
822 utext_close(textToBreak);
823 delete textMap;
824 }
2ca993e8 825
b331163b
A
826 int32_t getSrcLine(int32_t bp);
827 int32_t getExpectedBreak(int32_t bp);
828 int32_t getSrcCol(int32_t bp);
829
830 void setUTF16(UErrorCode &status);
831 void setUTF8(UErrorCode &status);
73c04bcf
A
832};
833
b331163b
A
834// Append a UnicodeString to a CharString with UTF-8 encoding.
835// Substitute any invalid chars.
836// Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
837static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
838 if (U_FAILURE(status)) {
839 return;
840 }
841 int32_t utf8Length;
842 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
843 src.getBuffer(), src.length(), // UTF-16 data
844 0xfffd, NULL, // Substitution char, number of subs.
845 &status);
846 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
847 return;
848 }
849 status = U_ZERO_ERROR;
850 int32_t capacity;
851 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
852 u_strToUTF8WithSub(buffer, utf8Length, NULL,
853 src.getBuffer(), src.length(),
854 0xfffd, NULL, &status);
855 dest.append(buffer, utf8Length, status);
856}
2ca993e8 857
b331163b
A
858
859void TestParams::setUTF16(UErrorCode &status) {
860 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
861 textMap->removeAllElements();
862 for (int32_t i=0; i<dataToBreak.length(); i++) {
863 if (i == dataToBreak.getChar32Start(i)) {
864 textMap->addElement(i, status);
865 } else {
866 textMap->addElement(-1, status);
867 }
868 }
869 textMap->addElement(dataToBreak.length(), status);
870 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
871}
872
873
874void TestParams::setUTF8(UErrorCode &status) {
875 if (U_FAILURE(status)) {
876 return;
877 }
878 utf8String.clear();
879 CharStringAppend(utf8String, dataToBreak, status);
880 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
881 if (U_FAILURE(status)) {
882 return;
883 }
884
885 textMap->removeAllElements();
886 int32_t utf16Index = 0;
887 for (;;) {
888 textMap->addElement(utf16Index, status);
889 UChar32 c32 = utext_current32(textToBreak);
890 if (c32 < 0) {
891 break;
892 }
893 utf16Index += U16_LENGTH(c32);
894 utext_next32(textToBreak);
895 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
896 textMap->addElement(-1, status);
897 }
898 }
899 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
900}
901
902
f3c0d7a5 903int32_t TestParams::getSrcLine(int32_t bp) {
b331163b
A
904 if (bp >= textMap->size()) {
905 bp = textMap->size() - 1;
906 }
907 int32_t i = 0;
908 for(; bp >= 0 ; --bp) {
909 // Move to a character boundary if we are not on one already.
910 i = textMap->elementAti(bp);
911 if (i >= 0) {
912 break;
913 }
914 }
915 return srcLine->elementAti(i);
916}
917
918
f3c0d7a5 919int32_t TestParams::getExpectedBreak(int32_t bp) {
b331163b
A
920 if (bp >= textMap->size()) {
921 return 0;
922 }
923 int32_t i = textMap->elementAti(bp);
924 int32_t retVal = 0;
925 if (i >= 0) {
926 retVal = expectedBreaks->elementAti(i);
927 }
928 return retVal;
929}
930
931
f3c0d7a5 932int32_t TestParams::getSrcCol(int32_t bp) {
b331163b
A
933 if (bp >= textMap->size()) {
934 bp = textMap->size() - 1;
935 }
936 int32_t i = 0;
937 for(; bp >= 0; --bp) {
938 // Move bp to a character boundary if we are not on one already.
939 i = textMap->elementAti(bp);
940 if (i >= 0) {
941 break;
942 }
943 }
944 return srcCol->elementAti(i);
945}
946
947
948void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
73c04bcf
A
949 int32_t bp;
950 int32_t prevBP;
951 int32_t i;
952
b331163b
A
953 TEST_ASSERT_SUCCESS(status);
954 if (U_FAILURE(status)) {
955 return;
956 }
957
73c04bcf
A
958 if (t->bi == NULL) {
959 return;
960 }
961
b331163b 962 t->bi->setText(t->textToBreak, status);
73c04bcf
A
963 //
964 // Run the iterator forward
965 //
966 prevBP = -1;
967 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
968 if (prevBP == bp) {
969 // Fail for lack of forward progress.
970 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
b331163b 971 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
972 break;
973 }
974
b331163b 975 // Check that there we didn't miss an expected break between the last one
73c04bcf
A
976 // and this one.
977 for (i=prevBP+1; i<bp; i++) {
b331163b 978 if (t->getExpectedBreak(i) != 0) {
73c04bcf
A
979 int expected[] = {0, i};
980 printStringBreaks(t->dataToBreak, expected, 2);
981 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 982 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
983 }
984 }
985
986 // Check that the break we did find was expected
b331163b 987 if (t->getExpectedBreak(bp) == 0) {
73c04bcf 988 int expected[] = {0, bp};
b331163b 989 printStringBreaks(t->textToBreak, expected, 2);
73c04bcf 990 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
b331163b 991 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
992 } else {
993 // The break was expected.
994 // Check that the {nnn} tag value is correct.
b331163b 995 int32_t expectedTagVal = t->getExpectedBreak(bp);
73c04bcf
A
996 if (expectedTagVal == -1) {
997 expectedTagVal = 0;
998 }
b331163b 999 int32_t line = t->getSrcLine(bp);
f3c0d7a5 1000 int32_t rs = t->bi->getRuleStatus();
73c04bcf
A
1001 if (rs != expectedTagVal) {
1002 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1003 " Actual, Expected status = %4d, %4d",
b331163b 1004 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
73c04bcf
A
1005 }
1006 }
1007
73c04bcf
A
1008 prevBP = bp;
1009 }
1010
1011 // Verify that there were no missed expected breaks after the last one found
b331163b
A
1012 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1013 if (t->getExpectedBreak(i) != 0) {
73c04bcf 1014 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 1015 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
1016 }
1017 }
1018
1019 //
1020 // Run the iterator backwards, verify that the same breaks are found.
1021 //
b331163b 1022 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
73c04bcf
A
1023 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1024 if (prevBP == bp) {
1025 // Fail for lack of progress.
1026 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
b331163b 1027 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
1028 break;
1029 }
1030
b331163b 1031 // Check that we didn't miss an expected break between the last one
73c04bcf
A
1032 // and this one. (UVector returns zeros for index out of bounds.)
1033 for (i=prevBP-1; i>bp; i--) {
b331163b
A
1034 if (t->getExpectedBreak(i) != 0) {
1035 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1036 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
1037 }
1038 }
1039
1040 // Check that the break we did find was expected
b331163b 1041 if (t->getExpectedBreak(bp) == 0) {
73c04bcf 1042 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
b331163b 1043 bp, t->getSrcLine(bp), t->getSrcCol(bp));
73c04bcf
A
1044 } else {
1045 // The break was expected.
1046 // Check that the {nnn} tag value is correct.
b331163b 1047 int32_t expectedTagVal = t->getExpectedBreak(bp);
73c04bcf
A
1048 if (expectedTagVal == -1) {
1049 expectedTagVal = 0;
1050 }
b331163b
A
1051 int line = t->getSrcLine(bp);
1052 int32_t rs = t->bi->getRuleStatus();
73c04bcf
A
1053 if (rs != expectedTagVal) {
1054 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1055 " Actual, Expected status = %4d, %4d",
b331163b 1056 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
73c04bcf
A
1057 }
1058 }
1059
1060 prevBP = bp;
1061 }
1062
1063 // Verify that there were no missed breaks prior to the last one found
1064 for (i=prevBP-1; i>=0; i--) {
b331163b 1065 if (t->getExpectedBreak(i) != 0) {
73c04bcf 1066 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
b331163b 1067 i, t->getSrcLine(i), t->getSrcCol(i));
73c04bcf
A
1068 }
1069 }
51004dcb
A
1070
1071 // Check isBoundary()
b331163b
A
1072 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1073 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
51004dcb
A
1074 UBool boundaryFound = t->bi->isBoundary(i);
1075 if (boundaryExpected != boundaryFound) {
1076 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1077 " Expected, Actual= %s, %s",
b331163b 1078 i, t->getSrcLine(i), t->getSrcCol(i),
51004dcb
A
1079 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1080 }
1081 }
1082
1083 // Check following()
b331163b 1084 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
51004dcb
A
1085 int32_t actualBreak = t->bi->following(i);
1086 int32_t expectedBreak = BreakIterator::DONE;
b331163b
A
1087 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1088 if (t->getExpectedBreak(j) != 0) {
51004dcb
A
1089 expectedBreak = j;
1090 break;
1091 }
1092 }
1093 if (expectedBreak != actualBreak) {
1094 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1095 " Expected, Actual= %d, %d",
b331163b 1096 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
51004dcb
A
1097 }
1098 }
1099
1100 // Check preceding()
b331163b 1101 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
51004dcb
A
1102 int32_t actualBreak = t->bi->preceding(i);
1103 int32_t expectedBreak = BreakIterator::DONE;
1104
b331163b
A
1105 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1106 // preceding(trailing byte) will return the index of some preceding code point,
1107 // not the lead byte of the current code point, even though that has a smaller index.
1108 // Therefore, start looking at the expected break data not at i-1, but at
1109 // the start of code point index - 1.
1110 utext_setNativeIndex(t->textToBreak, i);
1111 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1112 for (; j >= 0; j--) {
1113 if (t->getExpectedBreak(j) != 0) {
51004dcb
A
1114 expectedBreak = j;
1115 break;
1116 }
1117 }
1118 if (expectedBreak != actualBreak) {
1119 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1120 " Expected, Actual= %d, %d",
b331163b 1121 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
51004dcb
A
1122 }
1123 }
73c04bcf
A
1124}
1125
1126
1127void RBBITest::TestExtended() {
f3c0d7a5
A
1128 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
1129 // data driven test closely entangles filtered and regular data.
1130#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
73c04bcf
A
1131 UErrorCode status = U_ZERO_ERROR;
1132 Locale locale("");
1133
1134 UnicodeString rules;
b331163b 1135 TestParams tp(status);
73c04bcf 1136
2ca993e8 1137 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
729e4ab9
A
1138 if (U_FAILURE(status)) {
1139 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1140 }
73c04bcf
A
1141
1142
1143 //
1144 // Open and read the test data file.
1145 //
1146 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1147 char testFileName[1000];
1148 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1149 errln("Can't open test data. Path too long.");
1150 return;
1151 }
1152 strcpy(testFileName, testDataDirectory);
1153 strcat(testFileName, "rbbitst.txt");
1154
1155 int len;
46f4442e 1156 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
73c04bcf
A
1157 if (U_FAILURE(status)) {
1158 return; /* something went wrong, error already output */
1159 }
1160
1161
2ca993e8 1162 bool skipTest = false; // Skip this test?
46f4442e 1163
73c04bcf
A
1164 //
1165 // Put the test data into a UnicodeString
1166 //
1167 UnicodeString testString(FALSE, testFile, len);
1168
1169 enum EParseState{
1170 PARSE_COMMENT,
1171 PARSE_TAG,
1172 PARSE_DATA,
1173 PARSE_NUM
1174 }
1175 parseState = PARSE_TAG;
1176
1177 EParseState savedState = PARSE_TAG;
1178
73c04bcf
A
1179 int32_t lineNum = 1;
1180 int32_t colStart = 0;
1181 int32_t column = 0;
1182 int32_t charIdx = 0;
1183
1184 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1185
1186 for (charIdx = 0; charIdx < len; ) {
1187 status = U_ZERO_ERROR;
1188 UChar c = testString.charAt(charIdx);
1189 charIdx++;
f3c0d7a5 1190 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
73c04bcf 1191 // treat CRLF as a unit
f3c0d7a5 1192 c = u'\n';
73c04bcf
A
1193 charIdx++;
1194 }
f3c0d7a5 1195 if (c == u'\n' || c == u'\r') {
73c04bcf
A
1196 lineNum++;
1197 colStart = charIdx;
1198 }
1199 column = charIdx - colStart + 1;
1200
1201 switch (parseState) {
1202 case PARSE_COMMENT:
f3c0d7a5 1203 if (c == u'\n' || c == u'\r') {
73c04bcf
A
1204 parseState = savedState;
1205 }
1206 break;
1207
1208 case PARSE_TAG:
1209 {
f3c0d7a5 1210 if (c == u'#') {
73c04bcf
A
1211 parseState = PARSE_COMMENT;
1212 savedState = PARSE_TAG;
1213 break;
1214 }
1215 if (u_isUWhiteSpace(c)) {
1216 break;
1217 }
1218 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1219 delete tp.bi;
1220 tp.bi = BreakIterator::createWordInstance(locale, status);
2ca993e8 1221 skipTest = false;
73c04bcf
A
1222 charIdx += 5;
1223 break;
1224 }
1225 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1226 delete tp.bi;
1227 tp.bi = BreakIterator::createCharacterInstance(locale, status);
2ca993e8 1228 skipTest = false;
73c04bcf
A
1229 charIdx += 5;
1230 break;
1231 }
1232 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1233 delete tp.bi;
1234 tp.bi = BreakIterator::createLineInstance(locale, status);
2ca993e8 1235 skipTest = false;
73c04bcf
A
1236 charIdx += 5;
1237 break;
1238 }
1239 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1240 delete tp.bi;
46f4442e 1241 tp.bi = BreakIterator::createSentenceInstance(locale, status);
2ca993e8 1242 skipTest = false;
73c04bcf
A
1243 charIdx += 5;
1244 break;
1245 }
1246 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1247 delete tp.bi;
1248 tp.bi = BreakIterator::createTitleInstance(locale, status);
1249 charIdx += 6;
1250 break;
1251 }
46f4442e 1252
73c04bcf
A
1253 // <locale loc_name>
1254 localeMatcher.reset(testString);
1255 if (localeMatcher.lookingAt(charIdx-1, status)) {
1256 UnicodeString localeName = localeMatcher.group(1, status);
1257 char localeName8[100];
1258 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1259 locale = Locale::createFromName(localeName8);
51004dcb 1260 charIdx += localeMatcher.group(0, status).length() - 1;
73c04bcf
A
1261 TEST_ASSERT_SUCCESS(status);
1262 break;
1263 }
1264 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1265 parseState = PARSE_DATA;
1266 charIdx += 5;
1267 tp.dataToBreak = "";
1268 tp.expectedBreaks->removeAllElements();
1269 tp.srcCol ->removeAllElements();
1270 tp.srcLine->removeAllElements();
1271 break;
1272 }
1273
1274 errln("line %d: Tag expected in test file.", lineNum);
73c04bcf
A
1275 parseState = PARSE_COMMENT;
1276 savedState = PARSE_DATA;
46f4442e 1277 goto end_test; // Stop the test.
73c04bcf
A
1278 }
1279 break;
1280
1281 case PARSE_DATA:
f3c0d7a5 1282 if (c == u'\u2022') { // u'•'
73c04bcf
A
1283 int32_t breakIdx = tp.dataToBreak.length();
1284 tp.expectedBreaks->setSize(breakIdx+1);
1285 tp.expectedBreaks->setElementAt(-1, breakIdx);
1286 tp.srcLine->setSize(breakIdx+1);
1287 tp.srcLine->setElementAt(lineNum, breakIdx);
1288 tp.srcCol ->setSize(breakIdx+1);
1289 tp.srcCol ->setElementAt(column, breakIdx);
1290 break;
1291 }
1292
1293 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1294 // Add final entry to mappings from break location to source file position.
1295 // Need one extra because last break position returned is after the
1296 // last char in the data, not at the last char.
1297 tp.srcLine->addElement(lineNum, status);
1298 tp.srcCol ->addElement(column, status);
1299
1300 parseState = PARSE_TAG;
1301 charIdx += 6;
1302
2ca993e8
A
1303 if (!skipTest) {
1304 // RUN THE TEST!
1305 status = U_ZERO_ERROR;
1306 tp.setUTF16(status);
1307 executeTest(&tp, status);
1308 TEST_ASSERT_SUCCESS(status);
1309
1310 // Run again, this time with UTF-8 text wrapped in a UText.
1311 status = U_ZERO_ERROR;
1312 tp.setUTF8(status);
1313 TEST_ASSERT_SUCCESS(status);
1314 executeTest(&tp, status);
1315 }
73c04bcf
A
1316 break;
1317 }
1318
46f4442e 1319 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
73c04bcf
A
1320 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1321 // Get the code point from the name and insert it into the test data.
1322 // (Damn, no API takes names in Unicode !!!
1323 // we've got to take it back to char *)
f3c0d7a5 1324 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
73c04bcf
A
1325 int32_t nameLength = nameEndIdx - (charIdx+2);
1326 char charNameBuf[200];
1327 UChar32 theChar = -1;
1328 if (nameEndIdx != -1) {
1329 UErrorCode status = U_ZERO_ERROR;
1330 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1331 charNameBuf[sizeof(charNameBuf)-1] = 0;
1332 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1333 if (U_FAILURE(status)) {
1334 theChar = -1;
1335 }
1336 }
1337 if (theChar == -1) {
1338 errln("Error in named character in test file at line %d, col %d",
1339 lineNum, column);
1340 } else {
1341 // Named code point was recognized. Insert it
1342 // into the test data.
1343 tp.dataToBreak.append(theChar);
1344 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1345 tp.srcLine->addElement(lineNum, status);
1346 tp.srcCol ->addElement(column, status);
1347 }
1348 }
1349 if (nameEndIdx > charIdx) {
1350 charIdx = nameEndIdx+1;
1351
1352 }
1353 break;
1354 }
1355
1356
1357
1358
1359 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1360 charIdx++;
1361 int32_t breakIdx = tp.dataToBreak.length();
1362 tp.expectedBreaks->setSize(breakIdx+1);
1363 tp.expectedBreaks->setElementAt(-1, breakIdx);
1364 tp.srcLine->setSize(breakIdx+1);
1365 tp.srcLine->setElementAt(lineNum, breakIdx);
1366 tp.srcCol ->setSize(breakIdx+1);
1367 tp.srcCol ->setElementAt(column, breakIdx);
1368 break;
1369 }
1370
f3c0d7a5 1371 if (c == u'<') {
73c04bcf
A
1372 tagValue = 0;
1373 parseState = PARSE_NUM;
1374 break;
1375 }
1376
f3c0d7a5 1377 if (c == u'#' && column==3) { // TODO: why is column off so far?
73c04bcf
A
1378 parseState = PARSE_COMMENT;
1379 savedState = PARSE_DATA;
1380 break;
1381 }
1382
f3c0d7a5 1383 if (c == u'\\') {
73c04bcf
A
1384 // Check for \ at end of line, a line continuation.
1385 // Advance over (discard) the newline
1386 UChar32 cp = testString.char32At(charIdx);
f3c0d7a5 1387 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
73c04bcf
A
1388 // We have a CR LF
1389 // Need an extra increment of the input ptr to move over both of them
1390 charIdx++;
1391 }
f3c0d7a5 1392 if (cp == u'\n' || cp == u'\r') {
73c04bcf
A
1393 lineNum++;
1394 colStart = charIdx;
1395 charIdx++;
1396 break;
1397 }
1398
1399 // Let unescape handle the back slash.
1400 cp = testString.unescapeAt(charIdx);
1401 if (cp != -1) {
1402 // Escape sequence was recognized. Insert the char
1403 // into the test data.
1404 tp.dataToBreak.append(cp);
1405 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1406 tp.srcLine->addElement(lineNum, status);
1407 tp.srcCol ->addElement(column, status);
1408 }
1409 break;
1410 }
1411
1412
1413 // Not a recognized backslash escape sequence.
1414 // Take the next char as a literal.
1415 // TODO: Should this be an error?
1416 c = testString.charAt(charIdx);
1417 charIdx = testString.moveIndex32(charIdx, 1);
1418 }
1419
1420 // Normal, non-escaped data char.
1421 tp.dataToBreak.append(c);
1422
1423 // Save the mapping from offset in the data to line/column numbers in
1424 // the original input file. Will be used for better error messages only.
1425 // If there's an expected break before this char, the slot in the mapping
1426 // vector will already be set for this char; don't overwrite it.
1427 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1428 tp.srcLine->addElement(lineNum, status);
1429 tp.srcCol ->addElement(column, status);
1430 }
1431 break;
1432
1433
1434 case PARSE_NUM:
1435 // We are parsing an expected numeric tag value, like <1234>,
1436 // within a chunk of data.
1437 if (u_isUWhiteSpace(c)) {
1438 break;
1439 }
1440
f3c0d7a5 1441 if (c == u'>') {
73c04bcf
A
1442 // Finished the number. Add the info to the expected break data,
1443 // and switch parse state back to doing plain data.
1444 parseState = PARSE_DATA;
1445 if (tagValue == 0) {
1446 tagValue = -1;
1447 }
1448 int32_t breakIdx = tp.dataToBreak.length();
1449 tp.expectedBreaks->setSize(breakIdx+1);
1450 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1451 tp.srcLine->setSize(breakIdx+1);
1452 tp.srcLine->setElementAt(lineNum, breakIdx);
1453 tp.srcCol ->setSize(breakIdx+1);
1454 tp.srcCol ->setElementAt(column, breakIdx);
1455 break;
1456 }
1457
1458 if (u_isdigit(c)) {
1459 tagValue = tagValue*10 + u_charDigitValue(c);
1460 break;
1461 }
1462
1463 errln("Syntax Error in test file at line %d, col %d",
1464 lineNum, column);
73c04bcf 1465 parseState = PARSE_COMMENT;
46f4442e 1466 goto end_test; // Stop the test
73c04bcf
A
1467 break;
1468 }
1469
1470
1471 if (U_FAILURE(status)) {
4388f060 1472 dataerrln("ICU Error %s while parsing test file at line %d.",
73c04bcf 1473 u_errorName(status), lineNum);
73c04bcf 1474 status = U_ZERO_ERROR;
46f4442e 1475 goto end_test; // Stop the test
73c04bcf
A
1476 }
1477
1478 }
1479
1480end_test:
73c04bcf
A
1481 delete [] testFile;
1482#endif
1483}
1484
729e4ab9
A
1485
1486//-------------------------------------------------------------------------------
1487//
1488// TestDictRules create a break iterator from source rules that includes a
1489// dictionary range. Regression for bug #7130. Source rules
1490// do not declare a break iterator type (word, line, sentence, etc.
1491// but the dictionary code, without a type, would loop.
1492//
1493//-------------------------------------------------------------------------------
1494void RBBITest::TestDictRules() {
1495 const char *rules = "$dictionary = [a-z]; \n"
1496 "!!forward; \n"
1497 "$dictionary $dictionary; \n"
1498 "!!reverse; \n"
1499 "$dictionary $dictionary; \n";
1500 const char *text = "aa";
1501 UErrorCode status = U_ZERO_ERROR;
1502 UParseError parseError;
1503
1504 RuleBasedBreakIterator bi(rules, parseError, status);
1505 if (U_SUCCESS(status)) {
1506 UnicodeString utext = text;
1507 bi.setText(utext);
1508 int32_t position;
1509 int32_t loops;
1510 for (loops = 0; loops<10; loops++) {
1511 position = bi.next();
1512 if (position == RuleBasedBreakIterator::DONE) {
1513 break;
1514 }
1515 }
1516 TEST_ASSERT(loops == 1);
1517 } else {
1518 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1519 }
1520}
1521
1522
73c04bcf
A
1523
1524//-------------------------------------------------------------------------------
1525//
1526// ReadAndConvertFile Read a text data file, convert it to UChars, and
2ca993e8 1527// return the data in one big UChar * buffer, which the caller must delete.
73c04bcf 1528//
46f4442e
A
1529// parameters:
1530// fileName: the name of the file, with no directory part. The test data directory
1531// is assumed.
1532// ulen an out parameter, receives the actual length (in UChars) of the file data.
1533// encoding The file encoding. If the file contains a BOM, that will override the encoding
1534// specified here. The BOM, if it exists, will be stripped from the returned data.
1535// Pass NULL for the system default encoding.
1536// status
1537// returns:
1538// The file data, converted to UChar.
1539// The caller must delete this when done with
1540// delete [] theBuffer;
1541//
73c04bcf
A
1542// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1543// Move this function to some common place.
1544//
1545//--------------------------------------------------------------------------------
46f4442e 1546UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
73c04bcf
A
1547 UChar *retPtr = NULL;
1548 char *fileBuf = NULL;
1549 UConverter* conv = NULL;
1550 FILE *f = NULL;
1551
1552 ulen = 0;
1553 if (U_FAILURE(status)) {
1554 return retPtr;
1555 }
1556
1557 //
1558 // Open the file.
1559 //
1560 f = fopen(fileName, "rb");
1561 if (f == 0) {
729e4ab9 1562 dataerrln("Error opening test data file %s\n", fileName);
73c04bcf
A
1563 status = U_FILE_ACCESS_ERROR;
1564 return NULL;
1565 }
1566 //
1567 // Read it in
1568 //
1569 int fileSize;
1570 int amt_read;
1571
1572 fseek( f, 0, SEEK_END);
1573 fileSize = ftell(f);
1574 fileBuf = new char[fileSize];
1575 fseek(f, 0, SEEK_SET);
1576 amt_read = fread(fileBuf, 1, fileSize, f);
1577 if (amt_read != fileSize || fileSize <= 0) {
1578 errln("Error reading test data file.");
1579 goto cleanUpAndReturn;
1580 }
1581
1582 //
1583 // Look for a Unicode Signature (BOM) on the data just read
1584 //
1585 int32_t signatureLength;
1586 const char * fileBufC;
46f4442e 1587 const char* bomEncoding;
73c04bcf
A
1588
1589 fileBufC = fileBuf;
46f4442e 1590 bomEncoding = ucnv_detectUnicodeSignature(
73c04bcf 1591 fileBuf, fileSize, &signatureLength, &status);
46f4442e 1592 if(bomEncoding!=NULL ){
73c04bcf
A
1593 fileBufC += signatureLength;
1594 fileSize -= signatureLength;
46f4442e 1595 encoding = bomEncoding;
73c04bcf
A
1596 }
1597
1598 //
1599 // Open a converter to take the rule file to UTF-16
1600 //
1601 conv = ucnv_open(encoding, &status);
1602 if (U_FAILURE(status)) {
1603 goto cleanUpAndReturn;
1604 }
1605
1606 //
1607 // Convert the rules to UChar.
1608 // Preflight first to determine required buffer size.
1609 //
1610 ulen = ucnv_toUChars(conv,
1611 NULL, // dest,
1612 0, // destCapacity,
1613 fileBufC,
1614 fileSize,
1615 &status);
1616 if (status == U_BUFFER_OVERFLOW_ERROR) {
1617 // Buffer Overflow is expected from the preflight operation.
1618 status = U_ZERO_ERROR;
1619
1620 retPtr = new UChar[ulen+1];
1621 ucnv_toUChars(conv,
1622 retPtr, // dest,
1623 ulen+1,
1624 fileBufC,
1625 fileSize,
1626 &status);
1627 }
1628
1629cleanUpAndReturn:
1630 fclose(f);
1631 delete []fileBuf;
1632 ucnv_close(conv);
1633 if (U_FAILURE(status)) {
1634 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 1635 delete []retPtr;
73c04bcf
A
1636 retPtr = 0;
1637 ulen = 0;
1638 };
1639 return retPtr;
1640}
1641
1642
73c04bcf 1643
46f4442e 1644//--------------------------------------------------------------------------------------------
73c04bcf 1645//
46f4442e 1646// Run tests from each of the boundary test data files distributed by the Unicode Consortium
73c04bcf 1647//
46f4442e
A
1648//-------------------------------------------------------------------------------------------
1649void RBBITest::TestUnicodeFiles() {
1650 RuleBasedBreakIterator *bi;
1651 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1652
729e4ab9 1653 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
46f4442e
A
1654 TEST_ASSERT_SUCCESS(status);
1655 if (U_SUCCESS(status)) {
1656 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1657 }
1658 delete bi;
73c04bcf 1659
729e4ab9 1660 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
46f4442e
A
1661 TEST_ASSERT_SUCCESS(status);
1662 if (U_SUCCESS(status)) {
1663 runUnicodeTestData("WordBreakTest.txt", bi);
1664 }
1665 delete bi;
73c04bcf 1666
729e4ab9 1667 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
46f4442e
A
1668 TEST_ASSERT_SUCCESS(status);
1669 if (U_SUCCESS(status)) {
1670 runUnicodeTestData("SentenceBreakTest.txt", bi);
1671 }
1672 delete bi;
73c04bcf 1673
729e4ab9 1674 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
46f4442e
A
1675 TEST_ASSERT_SUCCESS(status);
1676 if (U_SUCCESS(status)) {
1677 runUnicodeTestData("LineBreakTest.txt", bi);
73c04bcf 1678 }
46f4442e 1679 delete bi;
73c04bcf
A
1680}
1681
1682
b331163b
A
1683// Check for test cases from the Unicode test data files that are known to fail
1684// and should be skipped because ICU is not yet able to fully implement the spec.
1685// See ticket #7270.
1686
1687UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
f3c0d7a5
A
1688 static struct TestCase {
1689 const char *fFileName;
1690 const UChar *fString;
1691 } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
1692 {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
1693 {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
1694 {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
1695 {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
1696 {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
1697 {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
1698 // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
1699 {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
1700 {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
1701 {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
1702
1703 // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
1704 {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
1705 {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
b331163b 1706 };
b331163b 1707
f3c0d7a5
A
1708 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1709 const TestCase &badCase = badTestCases[n];
1710 if (!strcmp(fileName, badCase.fFileName) &&
1711 testCase == UnicodeString(badCase.fString)) {
b331163b
A
1712 return logKnownIssue("7270");
1713 }
1714 }
1715 return FALSE;
1716}
1717
1718
46f4442e
A
1719//--------------------------------------------------------------------------------------------
1720//
1721// Run tests from one of the boundary test data files distributed by the Unicode Consortium
1722//
1723//-------------------------------------------------------------------------------------------
1724void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1725#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1726 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1727
46f4442e
A
1728 //
1729 // Open and read the test data file, put it into a UnicodeString.
1730 //
1731 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1732 char testFileName[1000];
1733 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
729e4ab9 1734 dataerrln("Can't open test data. Path too long.");
73c04bcf
A
1735 return;
1736 }
46f4442e
A
1737 strcpy(testFileName, testDataDirectory);
1738 strcat(testFileName, fileName);
2ca993e8 1739
46f4442e 1740 logln("Opening data file %s\n", fileName);
73c04bcf 1741
46f4442e
A
1742 int len;
1743 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1744 if (status != U_FILE_ACCESS_ERROR) {
1745 TEST_ASSERT_SUCCESS(status);
1746 TEST_ASSERT(testFile != NULL);
1747 }
1748 if (U_FAILURE(status) || testFile == NULL) {
1749 return; /* something went wrong, error already output */
1750 }
1751 UnicodeString testFileAsString(TRUE, testFile, len);
73c04bcf 1752
46f4442e
A
1753 //
1754 // Parse the test data file using a regular expression.
1755 // Each kind of token is recognized in its own capture group; what type of item was scanned
1756 // is identified by which group had a match.
1757 //
1758 // Caputure Group # 1 2 3 4 5
1759 // Parses this item: divide x hex digits comment \n unrecognized \n
1760 //
1761 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1762 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1763 UnicodeString testString;
1764 UVector32 breakPositions(status);
1765 int lineNumber = 1;
1766 TEST_ASSERT_SUCCESS(status);
1767 if (U_FAILURE(status)) {
73c04bcf
A
1768 return;
1769 }
1770
46f4442e
A
1771 //
1772 // Scan through each test case, building up the string to be broken in testString,
1773 // and the positions that should be boundaries in the breakPositions vector.
1774 //
729e4ab9 1775 int spin = 0;
46f4442e 1776 while (tokenMatcher.find()) {
729e4ab9
A
1777 if(tokenMatcher.hitEnd()) {
1778 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1779 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1780 and caused an infinite loop here on EBCDIC systems!
1781 */
1782 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1783 // return;
1784 }
46f4442e
A
1785 if (tokenMatcher.start(1, status) >= 0) {
1786 // Scanned a divide sign, indicating a break position in the test data.
1787 if (testString.length()>0) {
1788 breakPositions.addElement(testString.length(), status);
73c04bcf 1789 }
46f4442e
A
1790 }
1791 else if (tokenMatcher.start(2, status) >= 0) {
1792 // Scanned an 'x', meaning no break at this position in the test data
1793 // Nothing to be done here.
1794 }
1795 else if (tokenMatcher.start(3, status) >= 0) {
1796 // Scanned Hex digits. Convert them to binary, append to the character data string.
1797 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1798 int length = hexNumber.length();
1799 if (length<=8) {
1800 char buf[10];
1801 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1802 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1803 if (c<=0x10ffff) {
1804 testString.append(c);
1805 } else {
1806 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1807 fileName, lineNumber);
1808 }
1809 } else {
1810 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1811 fileName, lineNumber);
1812 }
1813 }
1814 else if (tokenMatcher.start(4, status) >= 0) {
1815 // Scanned to end of a line, possibly skipping over a comment in the process.
1816 // If the line from the file contained test data, run the test now.
2ca993e8 1817 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
46f4442e 1818 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
73c04bcf
A
1819 }
1820
46f4442e
A
1821 // Clear out this test case.
1822 // The string and breakPositions vector will be refilled as the next
1823 // test case is parsed.
1824 testString.remove();
1825 breakPositions.removeAllElements();
1826 lineNumber++;
1827 } else {
1828 // Scanner catchall. Something unrecognized appeared on the line.
1829 char token[16];
1830 UnicodeString uToken = tokenMatcher.group(0, status);
1831 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1832 token[sizeof(token)-1] = 0;
1833 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1834
1835 // Clean up, in preparation for continuing with the next line.
1836 testString.remove();
1837 breakPositions.removeAllElements();
1838 lineNumber++;
1839 }
1840 TEST_ASSERT_SUCCESS(status);
1841 if (U_FAILURE(status)) {
73c04bcf
A
1842 break;
1843 }
46f4442e 1844 }
73c04bcf 1845
46f4442e
A
1846 delete [] testFile;
1847 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1848}
73c04bcf 1849
46f4442e
A
1850//--------------------------------------------------------------------------------------------
1851//
1852// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1853// test data files. Do only a simple, forward-only check -
1854// this test is mostly to check that ICU and the Unicode
1855// data agree with each other.
1856//
1857//--------------------------------------------------------------------------------------------
1858void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1859 const UnicodeString &testString, // Text data to be broken
1860 UVector32 *breakPositions, // Positions where breaks should be found.
1861 RuleBasedBreakIterator *bi) {
1862 int32_t pos; // Break Position in the test string
1863 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1864 int32_t expectedPos; // Expected break position (index into test string)
1865
1866 bi->setText(testString);
1867 pos = bi->first();
1868 pos = bi->next();
1869
1870 while (pos != BreakIterator::DONE) {
1871 if (expectedI >= breakPositions->size()) {
1872 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1873 testFileName, lineNumber, pos);
1874 break;
73c04bcf 1875 }
46f4442e
A
1876 expectedPos = breakPositions->elementAti(expectedI);
1877 if (pos < expectedPos) {
1878 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1879 testFileName, lineNumber, pos);
1880 break;
1881 }
1882 if (pos > expectedPos) {
1883 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1884 testFileName, lineNumber, expectedPos);
73c04bcf
A
1885 break;
1886 }
46f4442e
A
1887 pos = bi->next();
1888 expectedI++;
1889 }
73c04bcf 1890
46f4442e
A
1891 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1892 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1893 testFileName, lineNumber, breakPositions->elementAti(expectedI));
73c04bcf 1894 }
46f4442e 1895}
73c04bcf 1896
73c04bcf 1897
73c04bcf
A
1898
1899#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
1900//---------------------------------------------------------------------------------------
1901//
1902// class RBBIMonkeyKind
1903//
1904// Monkey Test for Break Iteration
1905// Abstract interface class. Concrete derived classes independently
1906// implement the break rules for different iterator types.
1907//
1908// The Monkey Test itself doesn't know which type of break iterator it is
1909// testing, but works purely in terms of the interface defined here.
1910//
1911//---------------------------------------------------------------------------------------
1912class RBBIMonkeyKind {
1913public:
1914 // Return a UVector of UnicodeSets, representing the character classes used
1915 // for this type of iterator.
1916 virtual UVector *charClasses() = 0;
1917
1918 // Set the test text on which subsequent calls to next() will operate
1919 virtual void setText(const UnicodeString &s) = 0;
1920
1921 // Find the next break postion, starting from the prev break position, or from zero.
1922 // Return -1 after reaching end of string.
1923 virtual int32_t next(int32_t i) = 0;
1924
1925 virtual ~RBBIMonkeyKind();
1926 UErrorCode deferredStatus;
1927
1928
1929protected:
1930 RBBIMonkeyKind();
1931
1932private:
1933};
1934
1935RBBIMonkeyKind::RBBIMonkeyKind() {
1936 deferredStatus = U_ZERO_ERROR;
1937}
1938
1939RBBIMonkeyKind::~RBBIMonkeyKind() {
1940}
1941
1942
1943//----------------------------------------------------------------------------------------
1944//
1945// Random Numbers. Similar to standard lib rand() and srand()
1946// Not using library to
1947// 1. Get same results on all platforms.
1948// 2. Get access to current seed, to more easily reproduce failures.
1949//
1950//---------------------------------------------------------------------------------------
// Generator state. Assign to it to reseed; read it to learn the current seed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245 + 12345;
    m_seed = nextState;
    return (nextState >> 16) & 0x7fff;
}
1958
1959
f3c0d7a5
A
1960//
1961// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
1962//
// UnicodeSet pattern text for the Extended Pictographic characters.
// The pattern is a single bracketed set; the adjacent literals below are
// concatenated by the compiler into one UTF-16 string.
static const char16_t *gExtended_Pict =
    u"["
    "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
    "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
    "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
    "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
    "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
    "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
    "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
    "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
    "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
    "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
    "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
    "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
    "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
    "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
    "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
    "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
    "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
    "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
    "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
    "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
    "]";
1987
73c04bcf
A
1988//------------------------------------------------------------------------------------------
1989//
1990// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1991// of RBBIMonkeyKind.
1992//
1993//------------------------------------------------------------------------------------------
1994class RBBICharMonkey: public RBBIMonkeyKind {
1995public:
1996 RBBICharMonkey();
1997 virtual ~RBBICharMonkey();
1998 virtual UVector *charClasses();
1999 virtual void setText(const UnicodeString &s);
2000 virtual int32_t next(int32_t i);
2001private:
2002 UVector *fSets;
2003
2004 UnicodeSet *fCRLFSet;
2005 UnicodeSet *fControlSet;
2006 UnicodeSet *fExtendSet;
f3c0d7a5 2007 UnicodeSet *fZWJSet;
51004dcb 2008 UnicodeSet *fRegionalIndicatorSet;
46f4442e
A
2009 UnicodeSet *fPrependSet;
2010 UnicodeSet *fSpacingSet;
2011 UnicodeSet *fLSet;
2012 UnicodeSet *fVSet;
2013 UnicodeSet *fTSet;
2014 UnicodeSet *fLVSet;
2015 UnicodeSet *fLVTSet;
73c04bcf 2016 UnicodeSet *fHangulSet;
2ca993e8 2017 UnicodeSet *fEmojiBaseSet;
f3c0d7a5
A
2018 UnicodeSet *fEmojiModifierSet;
2019 UnicodeSet *fExtendedPictSet;
2020 UnicodeSet *fEBGSet;
2021 UnicodeSet *fEmojiNRKSet;
2022 UnicodeSet *fAnySet;
73c04bcf 2023
73c04bcf
A
2024 const UnicodeString *fText;
2025};
2026
2027
2028RBBICharMonkey::RBBICharMonkey() {
2029 UErrorCode status = U_ZERO_ERROR;
2030
2031 fText = NULL;
73c04bcf 2032
46f4442e 2033 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
f3c0d7a5
A
2034 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2035 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2036 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2037 fRegionalIndicatorSet =
2038 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
46f4442e
A
2039 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2040 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2041 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2042 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2043 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2044 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2045 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2046 fHangulSet = new UnicodeSet();
2047 fHangulSet->addAll(*fLSet);
2048 fHangulSet->addAll(*fVSet);
2049 fHangulSet->addAll(*fTSet);
2050 fHangulSet->addAll(*fLVSet);
2051 fHangulSet->addAll(*fLVTSet);
2ca993e8 2052
f3c0d7a5
A
2053 fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2054 fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2055 fExtendedPictSet = new UnicodeSet(gExtended_Pict, status);
2056 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
2057 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2058 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2059 fAnySet = new UnicodeSet(0, 0x10ffff);
2ca993e8 2060
f3c0d7a5 2061 fSets = new UVector(status);
73c04bcf
A
2062 fSets->addElement(fCRLFSet, status);
2063 fSets->addElement(fControlSet, status);
2064 fSets->addElement(fExtendSet, status);
51004dcb 2065 fSets->addElement(fRegionalIndicatorSet, status);
4388f060
A
2066 if (!fPrependSet->isEmpty()) {
2067 fSets->addElement(fPrependSet, status);
2068 }
46f4442e 2069 fSets->addElement(fSpacingSet, status);
73c04bcf
A
2070 fSets->addElement(fHangulSet, status);
2071 fSets->addElement(fAnySet, status);
2ca993e8
A
2072 fSets->addElement(fEmojiBaseSet, status);
2073 fSets->addElement(fEmojiModifierSet, status);
2074 fSets->addElement(fZWJSet, status);
f3c0d7a5
A
2075 fSets->addElement(fExtendedPictSet, status);
2076 fSets->addElement(fEBGSet, status);
2077 fSets->addElement(fEmojiNRKSet,status);
73c04bcf
A
2078 if (U_FAILURE(status)) {
2079 deferredStatus = status;
2080 }
2081}
2082
2083
2084void RBBICharMonkey::setText(const UnicodeString &s) {
2085 fText = &s;
73c04bcf
A
2086}
2087
2088
73c04bcf 2089
46f4442e
A
2090int32_t RBBICharMonkey::next(int32_t prevPos) {
2091 int p0, p1, p2, p3; // Indices of the significant code points around the
2092 // break position being tested. The candidate break
2093 // location is before p2.
2094
2095 int breakPos = -1;
2096
2097 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2ca993e8
A
2098 UChar32 cBase; // for (X Extend*) patterns, the X character.
2099
46f4442e
A
2100 if (U_FAILURE(deferredStatus)) {
2101 return -1;
73c04bcf 2102 }
46f4442e
A
2103
2104 // Previous break at end of string. return DONE.
2105 if (prevPos >= fText->length()) {
2106 return -1;
73c04bcf 2107 }
46f4442e
A
2108 p0 = p1 = p2 = p3 = prevPos;
2109 c3 = fText->char32At(prevPos);
2ca993e8 2110 c0 = c1 = c2 = cBase = 0;
57a6839d
A
2111 (void)p0; // suppress set but not used warning.
2112 (void)c0;
46f4442e
A
2113
2114 // Loop runs once per "significant" character position in the input text.
2115 for (;;) {
2116 // Move all of the positions forward in the input string.
2117 p0 = p1; c0 = c1;
2118 p1 = p2; c1 = c2;
2119 p2 = p3; c2 = c3;
2120
2121 // Advancd p3 by one codepoint
2122 p3 = fText->moveIndex32(p3, 1);
2123 c3 = fText->char32At(p3);
2124
2125 if (p1 == p2) {
2126 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2127 continue;
2128 }
2129 if (p2 == fText->length()) {
2130 // Reached end of string. Always a break position.
2131 break;
2132 }
2133
2134 // Rule GB3 CR x LF
2135 // No Extend or Format characters may appear between the CR and LF,
2136 // which requires the additional check for p2 immediately following p1.
2137 //
2138 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2139 continue;
2140 }
2141
2142 // Rule (GB4). ( Control | CR | LF ) <break>
2143 if (fControlSet->contains(c1) ||
2144 c1 == 0x0D ||
2145 c1 == 0x0A) {
2146 break;
2147 }
2148
2149 // Rule (GB5) <break> ( Control | CR | LF )
2150 //
2151 if (fControlSet->contains(c2) ||
2152 c2 == 0x0D ||
2153 c2 == 0x0A) {
2154 break;
2155 }
2156
2157
2158 // Rule (GB6) L x ( L | V | LV | LVT )
2159 if (fLSet->contains(c1) &&
2160 (fLSet->contains(c2) ||
2161 fVSet->contains(c2) ||
2162 fLVSet->contains(c2) ||
2163 fLVTSet->contains(c2))) {
2164 continue;
2165 }
2166
2167 // Rule (GB7) ( LV | V ) x ( V | T )
2168 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2169 (fVSet->contains(c2) || fTSet->contains(c2))) {
2170 continue;
2171 }
2172
2173 // Rule (GB8) ( LVT | T) x T
2174 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2175 fTSet->contains(c2)) {
2176 continue;
2177 }
2178
2ca993e8
A
2179 // Rule (GB9) x (Extend | ZWJ)
2180 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
2181 if (!fExtendSet->contains(c1)) {
2182 cBase = c1;
2183 }
46f4442e
A
2184 continue;
2185 }
2186
2187 // Rule (GB9a) x SpacingMark
2188 if (fSpacingSet->contains(c2)) {
2189 continue;
2190 }
2191
2192 // Rule (GB9b) Prepend x
2193 if (fPrependSet->contains(c1)) {
2194 continue;
2195 }
2196
f3c0d7a5
A
2197 // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier
2198 if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2ca993e8
A
2199 continue;
2200 }
f3c0d7a5 2201 if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
2ca993e8
A
2202 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2203 continue;
2204 }
2205
f3c0d7a5
A
2206 // Rule (GB11) (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
2207 if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
2208 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2209 continue;
2210 }
2211 if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) &&
2212 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2ca993e8
A
2213 continue;
2214 }
2215
2216 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2217 // Note: The first if condition is a little tricky. We only need to force
2218 // a break if there are three or more contiguous RIs. If there are
2219 // only two, a break following will occur via other rules, and will include
2220 // any trailing extend characters, which is needed behavior.
2221 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2222 && fRegionalIndicatorSet->contains(c2)) {
2223 break;
2224 }
2225 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2226 continue;
2227 }
2228
2229 // Rule (GB999) Any <break> Any
46f4442e
A
2230 break;
2231 }
2232
2233 breakPos = p2;
2234 return breakPos;
73c04bcf
A
2235}
2236
2237
46f4442e 2238
73c04bcf
A
2239UVector *RBBICharMonkey::charClasses() {
2240 return fSets;
2241}
2242
2243
2244RBBICharMonkey::~RBBICharMonkey() {
2245 delete fSets;
2246 delete fCRLFSet;
2247 delete fControlSet;
2248 delete fExtendSet;
51004dcb 2249 delete fRegionalIndicatorSet;
46f4442e
A
2250 delete fPrependSet;
2251 delete fSpacingSet;
2252 delete fLSet;
2253 delete fVSet;
2254 delete fTSet;
2255 delete fLVSet;
2256 delete fLVTSet;
73c04bcf
A
2257 delete fHangulSet;
2258 delete fAnySet;
2ca993e8
A
2259 delete fEmojiBaseSet;
2260 delete fEmojiModifierSet;
2261 delete fZWJSet;
f3c0d7a5
A
2262 delete fExtendedPictSet;
2263 delete fEBGSet;
2264 delete fEmojiNRKSet;
73c04bcf
A
2265}
2266
2267//------------------------------------------------------------------------------------------
2268//
2269// class RBBIWordMonkey Word Break specific implementation
2270// of RBBIMonkeyKind.
2271//
2272//------------------------------------------------------------------------------------------
2273class RBBIWordMonkey: public RBBIMonkeyKind {
2274public:
2275 RBBIWordMonkey();
2276 virtual ~RBBIWordMonkey();
2277 virtual UVector *charClasses();
2278 virtual void setText(const UnicodeString &s);
2279 virtual int32_t next(int32_t i);
2280private:
2281 UVector *fSets;
2282
46f4442e
A
2283 UnicodeSet *fCRSet;
2284 UnicodeSet *fLFSet;
2285 UnicodeSet *fNewlineSet;
57a6839d 2286 UnicodeSet *fRegionalIndicatorSet;
73c04bcf 2287 UnicodeSet *fKatakanaSet;
57a6839d 2288 UnicodeSet *fHebrew_LetterSet;
73c04bcf 2289 UnicodeSet *fALetterSet;
57a6839d
A
2290 UnicodeSet *fSingle_QuoteSet;
2291 UnicodeSet *fDouble_QuoteSet;
46f4442e 2292 UnicodeSet *fMidNumLetSet;
73c04bcf
A
2293 UnicodeSet *fMidLetterSet;
2294 UnicodeSet *fMidNumSet;
2295 UnicodeSet *fNumericSet;
2296 UnicodeSet *fFormatSet;
2297 UnicodeSet *fOtherSet;
2298 UnicodeSet *fExtendSet;
2299 UnicodeSet *fExtendNumLetSet;
f3c0d7a5 2300 UnicodeSet *fDictionarySet;
2ca993e8 2301 UnicodeSet *fEBaseSet;
f3c0d7a5 2302 UnicodeSet *fEBGSet;
2ca993e8 2303 UnicodeSet *fEModifierSet;
f3c0d7a5
A
2304 UnicodeSet *fZWJSet;
2305 UnicodeSet *fExtendedPictSet;
2306 UnicodeSet *fEmojiNRKSet;
73c04bcf 2307
73c04bcf
A
2308 const UnicodeString *fText;
2309};
2310
2311
46f4442e 2312RBBIWordMonkey::RBBIWordMonkey()
73c04bcf
A
2313{
2314 UErrorCode status = U_ZERO_ERROR;
2315
73c04bcf
A
2316 fSets = new UVector(status);
2317
f3c0d7a5
A
2318 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
2319 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
2320 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
2321 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
2322 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2323 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2324 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2325 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
2326 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
2327 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
2328 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
2329 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
2330 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2331 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
2332 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2333 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
2334
2335 fEBaseSet = new UnicodeSet(u"[[\\p{Word_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]", status);
2336 fEBGSet = new UnicodeSet(u"[\\p{Word_Break = EBG}]", status);
2337 fEModifierSet = new UnicodeSet(u"[\\p{Word_Break = EM}]", status);
2338 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
2339 fExtendedPictSet = new UnicodeSet(gExtended_Pict, status);
2340 fEmojiNRKSet = new UnicodeSet(
2341 u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status);
2342
2343 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2344 fDictionarySet->addAll(*fKatakanaSet);
2345 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2346
2347 fALetterSet->removeAll(*fDictionarySet);
2ca993e8 2348
73c04bcf
A
2349 fOtherSet = new UnicodeSet();
2350 if(U_FAILURE(status)) {
f3c0d7a5
A
2351 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2352 deferredStatus = status;
2353 return;
73c04bcf
A
2354 }
2355
2356 fOtherSet->complement();
46f4442e
A
2357 fOtherSet->removeAll(*fCRSet);
2358 fOtherSet->removeAll(*fLFSet);
2359 fOtherSet->removeAll(*fNewlineSet);
73c04bcf 2360 fOtherSet->removeAll(*fKatakanaSet);
57a6839d 2361 fOtherSet->removeAll(*fHebrew_LetterSet);
73c04bcf 2362 fOtherSet->removeAll(*fALetterSet);
57a6839d
A
2363 fOtherSet->removeAll(*fSingle_QuoteSet);
2364 fOtherSet->removeAll(*fDouble_QuoteSet);
73c04bcf
A
2365 fOtherSet->removeAll(*fMidLetterSet);
2366 fOtherSet->removeAll(*fMidNumSet);
2367 fOtherSet->removeAll(*fNumericSet);
2368 fOtherSet->removeAll(*fExtendNumLetSet);
2369 fOtherSet->removeAll(*fFormatSet);
2370 fOtherSet->removeAll(*fExtendSet);
51004dcb 2371 fOtherSet->removeAll(*fRegionalIndicatorSet);
2ca993e8 2372 fOtherSet->removeAll(*fEBaseSet);
f3c0d7a5 2373 fOtherSet->removeAll(*fEBGSet);
2ca993e8 2374 fOtherSet->removeAll(*fEModifierSet);
f3c0d7a5
A
2375 fOtherSet->removeAll(*fZWJSet);
2376 fOtherSet->removeAll(*fExtendedPictSet);
2377 fOtherSet->removeAll(*fEmojiNRKSet);
2378
46f4442e 2379 // Inhibit dictionary characters from being tested at all.
f3c0d7a5 2380 fOtherSet->removeAll(*fDictionarySet);
73c04bcf 2381
57a6839d
A
2382 fSets->addElement(fCRSet, status);
2383 fSets->addElement(fLFSet, status);
2384 fSets->addElement(fNewlineSet, status);
51004dcb 2385 fSets->addElement(fRegionalIndicatorSet, status);
57a6839d
A
2386 fSets->addElement(fHebrew_LetterSet, status);
2387 fSets->addElement(fALetterSet, status);
2388 fSets->addElement(fSingle_QuoteSet, status);
2389 fSets->addElement(fDouble_QuoteSet, status);
f3c0d7a5
A
2390 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
2391 // from the test data. They are all in the dictionary set,
2392 // which this (old, to be retired) monkey test cannot handle.
57a6839d
A
2393 fSets->addElement(fMidLetterSet, status);
2394 fSets->addElement(fMidNumLetSet, status);
2395 fSets->addElement(fMidNumSet, status);
2396 fSets->addElement(fNumericSet, status);
2397 fSets->addElement(fFormatSet, status);
2398 fSets->addElement(fExtendSet, status);
2399 fSets->addElement(fOtherSet, status);
2400 fSets->addElement(fExtendNumLetSet, status);
73c04bcf 2401
2ca993e8 2402 fSets->addElement(fEBaseSet, status);
f3c0d7a5 2403 fSets->addElement(fEBGSet, status);
2ca993e8 2404 fSets->addElement(fEModifierSet, status);
f3c0d7a5
A
2405 fSets->addElement(fZWJSet, status);
2406 fSets->addElement(fExtendedPictSet, status);
2407 fSets->addElement(fEmojiNRKSet, status);
2ca993e8 2408
73c04bcf
A
2409 if (U_FAILURE(status)) {
2410 deferredStatus = status;
2411 }
2412}
2413
2414void RBBIWordMonkey::setText(const UnicodeString &s) {
2415 fText = &s;
2416}
2417
2418
2419int32_t RBBIWordMonkey::next(int32_t prevPos) {
2420 int p0, p1, p2, p3; // Indices of the significant code points around the
2421 // break position being tested. The candidate break
2422 // location is before p2.
2423
2424 int breakPos = -1;
2425
2426 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2ca993e8 2427
46f4442e
A
2428 if (U_FAILURE(deferredStatus)) {
2429 return -1;
2430 }
73c04bcf
A
2431
2432 // Prev break at end of string. return DONE.
2433 if (prevPos >= fText->length()) {
2434 return -1;
2435 }
2436 p0 = p1 = p2 = p3 = prevPos;
2437 c3 = fText->char32At(prevPos);
2438 c0 = c1 = c2 = 0;
57a6839d 2439 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2440
2441 // Loop runs once per "significant" character position in the input text.
2442 for (;;) {
2443 // Move all of the positions forward in the input string.
2444 p0 = p1; c0 = c1;
2445 p1 = p2; c1 = c2;
2446 p2 = p3; c2 = c3;
2447
2448 // Advancd p3 by X(Extend | Format)* Rule 4
46f4442e 2449 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
73c04bcf
A
2450 do {
2451 p3 = fText->moveIndex32(p3, 1);
2452 c3 = fText->char32At(p3);
46f4442e
A
2453 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2454 break;
2455 };
73c04bcf 2456 }
f3c0d7a5 2457 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
73c04bcf
A
2458
2459
2460 if (p1 == p2) {
2461 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2462 continue;
2463 }
2464 if (p2 == fText->length()) {
2465 // Reached end of string. Always a break position.
2466 break;
2467 }
46f4442e 2468
73c04bcf
A
2469 // Rule (3) CR x LF
2470 // No Extend or Format characters may appear between the CR and LF,
2471 // which requires the additional check for p2 immediately following p1.
2472 //
46f4442e 2473 if (c1==0x0D && c2==0x0A) {
73c04bcf
A
2474 continue;
2475 }
2ca993e8 2476
46f4442e
A
2477 // Rule (3a) Break before and after newlines (including CR and LF)
2478 //
2479 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2480 break;
2481 };
2482 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2483 break;
2484 };
73c04bcf 2485
f3c0d7a5 2486 // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK).
2ca993e8
A
2487 // Not ignoring extend chars, so peek into input text to
2488 // get the potential ZWJ, the character immediately preceding c2.
2489 // Sloppy UChar32 indexing: p2-1 may reference trail half
2490 // but char32At will get the full code point.
f3c0d7a5 2491 if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2ca993e8
A
2492 continue;
2493 }
2494
57a6839d
A
2495 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2496 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2497 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2498 continue;
2499 }
2500
57a6839d 2501 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
73c04bcf 2502 //
57a6839d
A
2503 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2504 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2505 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2506 continue;
2507 }
2508
2509 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2510 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2511 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2512 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2513 continue;
2514 }
2515
57a6839d
A
2516 // Rule (7a) Hebrew_Letter x Single_Quote
2517 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2518 continue;
2519 }
73c04bcf 2520
57a6839d
A
2521 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2522 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2523 continue;
2524 }
2525
2526 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2527 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
73c04bcf
A
2528 continue;
2529 }
2530
2531 // Rule (8) Numeric x Numeric
2532 if (fNumericSet->contains(c1) &&
2533 fNumericSet->contains(c2)) {
2534 continue;
2535 }
2536
57a6839d
A
2537 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2538 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
73c04bcf
A
2539 fNumericSet->contains(c2)) {
2540 continue;
2541 }
2542
57a6839d 2543 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
73c04bcf 2544 if (fNumericSet->contains(c1) &&
57a6839d 2545 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2546 continue;
2547 }
2548
57a6839d 2549 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
46f4442e 2550 if (fNumericSet->contains(c0) &&
57a6839d 2551 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
73c04bcf
A
2552 fNumericSet->contains(c2)) {
2553 continue;
2554 }
2555
57a6839d 2556 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
73c04bcf 2557 if (fNumericSet->contains(c1) &&
57a6839d 2558 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
73c04bcf
A
2559 fNumericSet->contains(c3)) {
2560 continue;
2561 }
2562
2563 // Rule (13) Katakana x Katakana
f3c0d7a5
A
2564 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2565 // all Katakana are handled by the dictionary breaker.
73c04bcf
A
2566 if (fKatakanaSet->contains(c1) &&
2567 fKatakanaSet->contains(c2)) {
2568 continue;
2569 }
2570
57a6839d
A
2571 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2572 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
73c04bcf
A
2573 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2574 fExtendNumLetSet->contains(c2)) {
2575 continue;
51004dcb 2576 }
73c04bcf 2577
57a6839d 2578 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
73c04bcf 2579 if (fExtendNumLetSet->contains(c1) &&
57a6839d
A
2580 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2581 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2582 continue;
51004dcb
A
2583 }
2584
f3c0d7a5
A
2585 // WB 14 (E_Base | EBG) x E_Modifier
2586 if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
2587 continue;
2588 }
2589
2590 // Rule 15 - 17 Group pairs of Regional Indicators.
2ca993e8
A
2591 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2592 break;
2593 }
51004dcb
A
2594 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2595 continue;
2596 }
73c04bcf 2597
f3c0d7a5 2598 // Rule 999. Break found here.
73c04bcf
A
2599 break;
2600 }
2601
2602 breakPos = p2;
2603 return breakPos;
2604}
2605
2606
2607UVector *RBBIWordMonkey::charClasses() {
2608 return fSets;
2609}
2610
2611
2612RBBIWordMonkey::~RBBIWordMonkey() {
2613 delete fSets;
46f4442e
A
2614 delete fCRSet;
2615 delete fLFSet;
2616 delete fNewlineSet;
73c04bcf 2617 delete fKatakanaSet;
57a6839d 2618 delete fHebrew_LetterSet;
73c04bcf 2619 delete fALetterSet;
57a6839d
A
2620 delete fSingle_QuoteSet;
2621 delete fDouble_QuoteSet;
46f4442e 2622 delete fMidNumLetSet;
73c04bcf
A
2623 delete fMidLetterSet;
2624 delete fMidNumSet;
2625 delete fNumericSet;
2626 delete fFormatSet;
2627 delete fExtendSet;
2628 delete fExtendNumLetSet;
51004dcb 2629 delete fRegionalIndicatorSet;
f3c0d7a5 2630 delete fDictionarySet;
73c04bcf 2631 delete fOtherSet;
2ca993e8 2632 delete fEBaseSet;
f3c0d7a5 2633 delete fEBGSet;
2ca993e8 2634 delete fEModifierSet;
f3c0d7a5
A
2635 delete fZWJSet;
2636 delete fExtendedPictSet;
2637 delete fEmojiNRKSet;
73c04bcf
A
2638}
2639
2640
2641
2642
2643//------------------------------------------------------------------------------------------
2644//
2645// class RBBISentMonkey Sentence Break specific implementation
2646// of RBBIMonkeyKind.
2647//
2648//------------------------------------------------------------------------------------------
2649class RBBISentMonkey: public RBBIMonkeyKind {
2650public:
2651 RBBISentMonkey();
2652 virtual ~RBBISentMonkey();
2653 virtual UVector *charClasses();
2654 virtual void setText(const UnicodeString &s);
2655 virtual int32_t next(int32_t i);
2656private:
2657 int moveBack(int posFrom);
2658 int moveForward(int posFrom);
2659 UChar32 cAt(int pos);
2660
2661 UVector *fSets;
2662
2663 UnicodeSet *fSepSet;
2664 UnicodeSet *fFormatSet;
2665 UnicodeSet *fSpSet;
2666 UnicodeSet *fLowerSet;
2667 UnicodeSet *fUpperSet;
2668 UnicodeSet *fOLetterSet;
2669 UnicodeSet *fNumericSet;
2670 UnicodeSet *fATermSet;
46f4442e 2671 UnicodeSet *fSContinueSet;
73c04bcf
A
2672 UnicodeSet *fSTermSet;
2673 UnicodeSet *fCloseSet;
2674 UnicodeSet *fOtherSet;
2675 UnicodeSet *fExtendSet;
2676
2677 const UnicodeString *fText;
2678
2679};
2680
2681RBBISentMonkey::RBBISentMonkey()
2682{
2683 UErrorCode status = U_ZERO_ERROR;
2684
2685 fSets = new UVector(status);
2686
46f4442e
A
2687 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2688 // set and made into character classes of their own. For the monkey impl,
2689 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2690 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2691 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2692 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2693 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2694 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2695 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2696 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2697 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2698 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2699 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2700 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2701 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
73c04bcf
A
2702 fOtherSet = new UnicodeSet();
2703
2704 if(U_FAILURE(status)) {
2705 deferredStatus = status;
2706 return;
2707 }
2708
2709 fOtherSet->complement();
2710 fOtherSet->removeAll(*fSepSet);
2711 fOtherSet->removeAll(*fFormatSet);
2712 fOtherSet->removeAll(*fSpSet);
2713 fOtherSet->removeAll(*fLowerSet);
2714 fOtherSet->removeAll(*fUpperSet);
2715 fOtherSet->removeAll(*fOLetterSet);
2716 fOtherSet->removeAll(*fNumericSet);
2717 fOtherSet->removeAll(*fATermSet);
46f4442e 2718 fOtherSet->removeAll(*fSContinueSet);
73c04bcf
A
2719 fOtherSet->removeAll(*fSTermSet);
2720 fOtherSet->removeAll(*fCloseSet);
2721 fOtherSet->removeAll(*fExtendSet);
2722
46f4442e
A
2723 fSets->addElement(fSepSet, status);
2724 fSets->addElement(fFormatSet, status);
2725 fSets->addElement(fSpSet, status);
2726 fSets->addElement(fLowerSet, status);
2727 fSets->addElement(fUpperSet, status);
2728 fSets->addElement(fOLetterSet, status);
2729 fSets->addElement(fNumericSet, status);
2730 fSets->addElement(fATermSet, status);
2731 fSets->addElement(fSContinueSet, status);
2732 fSets->addElement(fSTermSet, status);
2733 fSets->addElement(fCloseSet, status);
2734 fSets->addElement(fOtherSet, status);
2735 fSets->addElement(fExtendSet, status);
73c04bcf
A
2736
2737 if (U_FAILURE(status)) {
2738 deferredStatus = status;
2739 }
2740}
2741
2742
2743
2744void RBBISentMonkey::setText(const UnicodeString &s) {
2745 fText = &s;
2746}
2747
2748UVector *RBBISentMonkey::charClasses() {
2749 return fSets;
2750}
2751
2752
2753// moveBack() Find the "significant" code point preceding the index i.
2754// Skips over ($Extend | $Format)* .
46f4442e 2755//
73c04bcf
A
2756int RBBISentMonkey::moveBack(int i) {
2757 if (i <= 0) {
2758 return -1;
2759 }
2760 UChar32 c;
2761 int32_t j = i;
2762 do {
2763 j = fText->moveIndex32(j, -1);
2764 c = fText->char32At(j);
2765 }
2766 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2767 return j;
2768
2769 }
2770
2771
2772int RBBISentMonkey::moveForward(int i) {
2773 if (i>=fText->length()) {
2774 return fText->length();
2775 }
2776 UChar32 c;
2777 int32_t j = i;
2778 do {
2779 j = fText->moveIndex32(j, 1);
2780 c = cAt(j);
2781 }
2782 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2783 return j;
2784}
2785
2786UChar32 RBBISentMonkey::cAt(int pos) {
2787 if (pos<0 || pos>=fText->length()) {
2788 return -1;
2789 } else {
2790 return fText->char32At(pos);
2791 }
2792}
2793
2794int32_t RBBISentMonkey::next(int32_t prevPos) {
2795 int p0, p1, p2, p3; // Indices of the significant code points around the
2796 // break position being tested. The candidate break
2797 // location is before p2.
2798
2799 int breakPos = -1;
2800
2801 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2802 UChar32 c;
2803
46f4442e
A
2804 if (U_FAILURE(deferredStatus)) {
2805 return -1;
2806 }
2807
73c04bcf
A
2808 // Prev break at end of string. return DONE.
2809 if (prevPos >= fText->length()) {
2810 return -1;
2811 }
2812 p0 = p1 = p2 = p3 = prevPos;
2813 c3 = fText->char32At(prevPos);
2814 c0 = c1 = c2 = 0;
57a6839d 2815 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2816
2817 // Loop runs once per "significant" character position in the input text.
2818 for (;;) {
2819 // Move all of the positions forward in the input string.
2820 p0 = p1; c0 = c1;
2821 p1 = p2; c1 = c2;
2822 p2 = p3; c2 = c3;
46f4442e 2823
73c04bcf
A
2824 // Advancd p3 by X(Extend | Format)* Rule 4
2825 p3 = moveForward(p3);
2826 c3 = cAt(p3);
2827
2828 // Rule (3) CR x LF
2829 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2830 continue;
2831 }
46f4442e 2832
73c04bcf
A
2833 // Rule (4). Sep <break>
2834 if (fSepSet->contains(c1)) {
2835 p2 = p1+1; // Separators don't combine with Extend or Format.
2836 break;
2837 }
2838
2839 if (p2 >= fText->length()) {
2840 // Reached end of string. Always a break position.
2841 break;
2842 }
2843
2844 if (p2 == prevPos) {
2845 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2846 continue;
2847 }
46f4442e 2848
73c04bcf
A
2849 // Rule (6). ATerm x Numeric
2850 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2851 continue;
2852 }
2853
2ca993e8
A
2854 // Rule (7). (Upper | Lower) ATerm x Uppper
2855 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2856 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
73c04bcf
A
2857 continue;
2858 }
2859
2860 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2861 // Note: STerm | ATerm are added to the negated part of the expression by a
2862 // note to the Unicode 5.0 documents.
2863 int p8 = p1;
2864 while (fSpSet->contains(cAt(p8))) {
2865 p8 = moveBack(p8);
2866 }
2867 while (fCloseSet->contains(cAt(p8))) {
2868 p8 = moveBack(p8);
2869 }
2870 if (fATermSet->contains(cAt(p8))) {
2871 p8=p2;
2872 for (;;) {
2873 c = cAt(p8);
2874 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2875 fLowerSet->contains(c) || fSepSet->contains(c) ||
2876 fATermSet->contains(c) || fSTermSet->contains(c)) {
2877 break;
2878 }
2879 p8 = moveForward(p8);
2880 }
2881 if (fLowerSet->contains(cAt(p8))) {
2882 continue;
2883 }
2884 }
46f4442e
A
2885
2886 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2887 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
73c04bcf
A
2888 p8 = p1;
2889 while (fSpSet->contains(cAt(p8))) {
2890 p8 = moveBack(p8);
2891 }
2892 while (fCloseSet->contains(cAt(p8))) {
2893 p8 = moveBack(p8);
2894 }
2895 c = cAt(p8);
2896 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2897 continue;
2898 }
2899 }
2900
46f4442e 2901 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
73c04bcf
A
2902 int p9 = p1;
2903 while (fCloseSet->contains(cAt(p9))) {
2904 p9 = moveBack(p9);
2905 }
2906 c = cAt(p9);
2907 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2908 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2909 continue;
2910 }
2911 }
2912
46f4442e 2913 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
73c04bcf
A
2914 int p10 = p1;
2915 while (fSpSet->contains(cAt(p10))) {
2916 p10 = moveBack(p10);
2917 }
2918 while (fCloseSet->contains(cAt(p10))) {
2919 p10 = moveBack(p10);
2920 }
2921 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2922 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2923 continue;
2924 }
2925 }
2926
46f4442e 2927 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
73c04bcf 2928 int p11 = p1;
46f4442e
A
2929 if (fSepSet->contains(cAt(p11))) {
2930 p11 = moveBack(p11);
2931 }
73c04bcf
A
2932 while (fSpSet->contains(cAt(p11))) {
2933 p11 = moveBack(p11);
2934 }
2935 while (fCloseSet->contains(cAt(p11))) {
2936 p11 = moveBack(p11);
2937 }
2938 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2939 break;
2940 }
2941
2942 // Rule (12) Any x Any
2943 continue;
2944 }
2945 breakPos = p2;
2946 return breakPos;
2947}
2948
2949RBBISentMonkey::~RBBISentMonkey() {
2950 delete fSets;
2951 delete fSepSet;
2952 delete fFormatSet;
2953 delete fSpSet;
2954 delete fLowerSet;
2955 delete fUpperSet;
2956 delete fOLetterSet;
2957 delete fNumericSet;
2958 delete fATermSet;
46f4442e 2959 delete fSContinueSet;
73c04bcf
A
2960 delete fSTermSet;
2961 delete fCloseSet;
2962 delete fOtherSet;
2963 delete fExtendSet;
2964}
2965
2966
2967
2968//-------------------------------------------------------------------------------------------
2969//
2970// RBBILineMonkey
2971//
2972//-------------------------------------------------------------------------------------------
2973
2974class RBBILineMonkey: public RBBIMonkeyKind {
2975public:
2976 RBBILineMonkey();
2977 virtual ~RBBILineMonkey();
2978 virtual UVector *charClasses();
2979 virtual void setText(const UnicodeString &s);
2980 virtual int32_t next(int32_t i);
2981 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2982private:
2983 UVector *fSets;
2984
2985 UnicodeSet *fBK;
2986 UnicodeSet *fCR;
2987 UnicodeSet *fLF;
2988 UnicodeSet *fCM;
2989 UnicodeSet *fNL;
2990 UnicodeSet *fSG;
2991 UnicodeSet *fWJ;
2992 UnicodeSet *fZW;
2993 UnicodeSet *fGL;
2994 UnicodeSet *fCB;
2995 UnicodeSet *fSP;
2996 UnicodeSet *fB2;
2997 UnicodeSet *fBA;
2998 UnicodeSet *fBB;
2999 UnicodeSet *fHY;
3000 UnicodeSet *fH2;
3001 UnicodeSet *fH3;
3002 UnicodeSet *fCL;
729e4ab9 3003 UnicodeSet *fCP;
73c04bcf
A
3004 UnicodeSet *fEX;
3005 UnicodeSet *fIN;
3006 UnicodeSet *fJL;
3007 UnicodeSet *fJV;
3008 UnicodeSet *fJT;
3009 UnicodeSet *fNS;
3010 UnicodeSet *fOP;
3011 UnicodeSet *fQU;
3012 UnicodeSet *fIS;
3013 UnicodeSet *fNU;
3014 UnicodeSet *fPO;
3015 UnicodeSet *fPR;
3016 UnicodeSet *fSY;
3017 UnicodeSet *fAI;
3018 UnicodeSet *fAL;
4388f060
A
3019 UnicodeSet *fCJ;
3020 UnicodeSet *fHL;
73c04bcf 3021 UnicodeSet *fID;
51004dcb 3022 UnicodeSet *fRI;
73c04bcf 3023 UnicodeSet *fXX;
2ca993e8
A
3024 UnicodeSet *fEB;
3025 UnicodeSet *fEM;
3026 UnicodeSet *fZJ;
f3c0d7a5
A
3027 UnicodeSet *fExtendedPict;
3028 UnicodeSet *fEmojiNRK;
73c04bcf 3029
57a6839d 3030 BreakIterator *fCharBI;
73c04bcf 3031 const UnicodeString *fText;
73c04bcf 3032 RegexMatcher *fNumberMatcher;
73c04bcf
A
3033};
3034
2ca993e8
A
3035RBBILineMonkey::RBBILineMonkey() :
3036 RBBIMonkeyKind(),
3037 fSets(NULL),
3038
3039 fCharBI(NULL),
3040 fText(NULL),
3041 fNumberMatcher(NULL)
73c04bcf 3042
73c04bcf 3043{
2ca993e8
A
3044 if (U_FAILURE(deferredStatus)) {
3045 return;
3046 }
3047
73c04bcf
A
3048 UErrorCode status = U_ZERO_ERROR;
3049
3050 fSets = new UVector(status);
3051
46f4442e
A
3052 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3053 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3054 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3055 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3056 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3057 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3058 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3059 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3060 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3061 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3062 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3063 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3064 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3065 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3066 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3067 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3068 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
729e4ab9 3069 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
46f4442e
A
3070 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3071 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3072 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3073 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3074 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3075 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3076 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3077 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3078 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3079 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3080 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3081 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3082 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3083 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3084 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
4388f060
A
3085 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3086 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
46f4442e 3087 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
51004dcb 3088 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
46f4442e
A
3089 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3090 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
f3c0d7a5
A
3091 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
3092 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3093 fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3094 fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3095 fExtendedPict = new UnicodeSet(gExtended_Pict, status);
73c04bcf
A
3096
3097 if (U_FAILURE(status)) {
3098 deferredStatus = status;
73c04bcf
A
3099 return;
3100 }
3101
3102 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3103 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
73c04bcf
A
3104 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3105
4388f060 3106 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
f3c0d7a5 3107 fCM->addAll(*fZJ); // ZWJ behaves as a CM.
2ca993e8 3108
73c04bcf
A
3109 fSets->addElement(fBK, status);
3110 fSets->addElement(fCR, status);
3111 fSets->addElement(fLF, status);
3112 fSets->addElement(fCM, status);
3113 fSets->addElement(fNL, status);
3114 fSets->addElement(fWJ, status);
3115 fSets->addElement(fZW, status);
3116 fSets->addElement(fGL, status);
3117 fSets->addElement(fCB, status);
3118 fSets->addElement(fSP, status);
3119 fSets->addElement(fB2, status);
3120 fSets->addElement(fBA, status);
3121 fSets->addElement(fBB, status);
3122 fSets->addElement(fHY, status);
3123 fSets->addElement(fH2, status);
3124 fSets->addElement(fH3, status);
3125 fSets->addElement(fCL, status);
729e4ab9 3126 fSets->addElement(fCP, status);
73c04bcf
A
3127 fSets->addElement(fEX, status);
3128 fSets->addElement(fIN, status);
3129 fSets->addElement(fJL, status);
3130 fSets->addElement(fJT, status);
3131 fSets->addElement(fJV, status);
3132 fSets->addElement(fNS, status);
3133 fSets->addElement(fOP, status);
3134 fSets->addElement(fQU, status);
3135 fSets->addElement(fIS, status);
3136 fSets->addElement(fNU, status);
3137 fSets->addElement(fPO, status);
3138 fSets->addElement(fPR, status);
3139 fSets->addElement(fSY, status);
3140 fSets->addElement(fAI, status);
3141 fSets->addElement(fAL, status);
4388f060 3142 fSets->addElement(fHL, status);
73c04bcf
A
3143 fSets->addElement(fID, status);
3144 fSets->addElement(fWJ, status);
51004dcb 3145 fSets->addElement(fRI, status);
73c04bcf 3146 fSets->addElement(fSG, status);
2ca993e8
A
3147 fSets->addElement(fEB, status);
3148 fSets->addElement(fEM, status);
3149 fSets->addElement(fZJ, status);
f3c0d7a5
A
3150 fSets->addElement(fExtendedPict, status);
3151 fSets->addElement(fEmojiNRK, status);
3152
73c04bcf 3153
2ca993e8 3154 const char *rules =
f3c0d7a5
A
3155 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3156 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3157 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3158 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3159 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3160 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
46f4442e 3161
73c04bcf 3162 fNumberMatcher = new RegexMatcher(
46f4442e 3163 UnicodeString(rules, -1, US_INV), 0, status);
73c04bcf
A
3164
3165 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3166
3167 if (U_FAILURE(status)) {
3168 deferredStatus = status;
3169 }
3170}
3171
3172
3173void RBBILineMonkey::setText(const UnicodeString &s) {
3174 fText = &s;
3175 fCharBI->setText(s);
3176 fNumberMatcher->reset(s);
3177}
3178
3179//
3180// rule9Adjust
3181// Line Break TR rules 9 and 10 implementation.
3182// This deals with combining marks and other sequences that
3183// that must be treated as if they were something other than what they actually are.
3184//
3185// This is factored out into a separate function because it must be applied twice for
3186// each potential break, once to the chars before the position being checked, then
3187// again to the text following the possible break.
3188//
3189void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3190 if (pos == -1) {
3191 // Invalid initial position. Happens during the warmup iteration of the
3192 // main loop in next().
3193 return;
3194 }
3195
3196 int32_t nPos = *nextPos;
3197
3198 // LB 9 Keep combining sequences together.
3199 // advance over any CM class chars. Note that Line Break CM is different
3200 // from the normal Grapheme Extend property.
3201 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3202 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3203 for (;;) {
3204 *nextChar = fText->char32At(nPos);
3205 if (!fCM->contains(*nextChar)) {
3206 break;
3207 }
3208 nPos = fText->moveIndex32(nPos, 1);
3209 }
3210 }
3211
3212
3213 // LB 9 Treat X CM* as if it were x.
3214 // No explicit action required.
3215
3216 // LB 10 Treat any remaining combining mark as AL
3217 if (fCM->contains(*posChar)) {
f3c0d7a5 3218 *posChar = u'A';
73c04bcf
A
3219 }
3220
3221 // Push the updated nextPos and nextChar back to our caller.
3222 // This only makes a difference if posChar got bigger by consuming a
3223 // combining sequence.
3224 *nextPos = nPos;
3225 *nextChar = fText->char32At(nPos);
3226}
3227
3228
3229
3230int32_t RBBILineMonkey::next(int32_t startPos) {
3231 UErrorCode status = U_ZERO_ERROR;
3232 int32_t pos; // Index of the char following a potential break position
3233 UChar32 thisChar; // Character at above position "pos"
3234
3235 int32_t prevPos; // Index of the char preceding a potential break position
3236 UChar32 prevChar; // Character at above position. Note that prevChar
3237 // and thisChar may not be adjacent because combining
3238 // characters between them will be ignored.
3239
4388f060
A
3240 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3241 UChar32 prevCharX2;
3242
73c04bcf
A
3243 int32_t nextPos; // Index of the next character following pos.
3244 // Usually skips over combining marks.
3245 int32_t nextCPPos; // Index of the code point following "pos."
3246 // May point to a combining mark.
3247 int32_t tPos; // temp value.
3248 UChar32 c;
3249
46f4442e
A
3250 if (U_FAILURE(deferredStatus)) {
3251 return -1;
3252 }
3253
73c04bcf
A
3254 if (startPos >= fText->length()) {
3255 return -1;
3256 }
3257
3258
3259 // Initial values for loop. Loop will run the first time without finding breaks,
3260 // while the invalid values shift out and the "this" and
3261 // "prev" positions are filled in with good values.
4388f060
A
3262 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3263 thisChar = prevChar = prevCharX2 = 0;
73c04bcf
A
3264 nextPos = nextCPPos = startPos;
3265
3266
3267 // Loop runs once per position in the test text, until a break position
3268 // is found.
3269 for (;;) {
4388f060
A
3270 prevPosX2 = prevPos;
3271 prevCharX2 = prevChar;
3272
73c04bcf
A
3273 prevPos = pos;
3274 prevChar = thisChar;
3275
3276 pos = nextPos;
3277 thisChar = fText->char32At(pos);
3278
3279 nextCPPos = fText->moveIndex32(pos, 1);
3280 nextPos = nextCPPos;
3281
3282 // Rule LB2 - Break at end of text.
3283 if (pos >= fText->length()) {
3284 break;
3285 }
3286
3287 // Rule LB 9 - adjust for combining sequences.
3288 // We do this one out-of-order because the adjustment does not change anything
3289 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3290 // be applied.
3291 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3292 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3293 c = fText->char32At(nextPos);
3294 rule9Adjust(pos, &thisChar, &nextPos, &c);
3295
3296 // If the loop is still warming up - if we haven't shifted the initial
3297 // -1 positions out of prevPos yet - loop back to advance the
3298 // position in the input without any further looking for breaks.
3299 if (prevPos == -1) {
3300 continue;
3301 }
46f4442e 3302
73c04bcf
A
3303 // LB 4 Always break after hard line breaks,
3304 if (fBK->contains(prevChar)) {
3305 break;
3306 }
3307
3308 // LB 5 Break after CR, LF, NL, but not inside CR LF
3309 if (prevChar == 0x0d && thisChar == 0x0a) {
3310 continue;
3311 }
3312 if (prevChar == 0x0d ||
3313 prevChar == 0x0a ||
3314 prevChar == 0x85) {
3315 break;
3316 }
3317
3318 // LB 6 Don't break before hard line breaks
3319 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3320 fBK->contains(thisChar)) {
3321 continue;
3322 }
3323
3324
3325 // LB 7 Don't break before spaces or zero-width space.
3326 if (fSP->contains(thisChar)) {
3327 continue;
3328 }
3329
3330 if (fZW->contains(thisChar)) {
3331 continue;
3332 }
3333
3334 // LB 8 Break after zero width space
3335 if (fZW->contains(prevChar)) {
3336 break;
3337 }
3338
f3c0d7a5 3339 // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
2ca993e8
A
3340 // The monkey test's way of ignoring combining characters doesn't work
3341 // for this rule. ZJ is also a CM. Need to get the actual character
3342 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3343 {
3344 int32_t prevIdx = fText->moveIndex32(pos, -1);
3345 UChar32 prevC = fText->char32At(prevIdx);
f3c0d7a5 3346 if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
2ca993e8
A
3347 continue;
3348 }
3349 }
3350
73c04bcf
A
3351 // LB 9, 10 Already done, at top of loop.
3352 //
3353
3354
3355 // LB 11 Do not break before or after WORD JOINER and related characters.
3356 // x WJ
3357 // WJ x
3358 //
3359 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3360 continue;
3361 }
3362
3363 // LB 12
73c04bcf 3364 // GL x
46f4442e 3365 if (fGL->contains(prevChar)) {
73c04bcf
A
3366 continue;
3367 }
2ca993e8 3368
46f4442e
A
3369 // LB 12a
3370 // [^SP BA HY] x GL
3371 if (!(fSP->contains(prevChar) ||
3372 fBA->contains(prevChar) ||
3373 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3374 continue;
3375 }
3376
3377
73c04bcf
A
3378
3379 // LB 13 Don't break before closings.
729e4ab9 3380 // NU x CL, NU x CP and NU x IS are not matched here so that they will
73c04bcf
A
3381 // fall into LB 17 and the more general number regular expression.
3382 //
729e4ab9
A
3383 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3384 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3385 fEX->contains(thisChar) ||
3386 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3387 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
73c04bcf
A
3388 continue;
3389 }
3390
3391 // LB 14 Don't break after OP SP*
3392 // Scan backwards, checking for this sequence.
3393 // The OP char could include combining marks, so we actually check for
3394 // OP CM* SP*
3395 // Another Twist: The Rule 67 fixes may have changed a SP CM
3396 // sequence into a ID char, so before scanning back through spaces,
3397 // verify that prevChar is indeed a space. The prevChar variable
3398 // may differ from fText[prevPos]
3399 tPos = prevPos;
3400 if (fSP->contains(prevChar)) {
3401 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3402 tPos=fText->moveIndex32(tPos, -1);
3403 }
3404 }
3405 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3406 tPos=fText->moveIndex32(tPos, -1);
3407 }
3408 if (fOP->contains(fText->char32At(tPos))) {
3409 continue;
3410 }
3411
3412
3413 // LB 15 QU SP* x OP
3414 if (fOP->contains(thisChar)) {
3415 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3416 int tPos = prevPos;
3417 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3418 tPos = fText->moveIndex32(tPos, -1);
3419 }
3420 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3421 tPos = fText->moveIndex32(tPos, -1);
3422 }
3423 if (fQU->contains(fText->char32At(tPos))) {
3424 continue;
3425 }
3426 }
3427
3428
3429
729e4ab9
A
3430 // LB 16 (CL | CP) SP* x NS
3431 // Scan backwards for SP* CM* (CL | CP)
73c04bcf
A
3432 if (fNS->contains(thisChar)) {
3433 int tPos = prevPos;
3434 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3435 tPos = fText->moveIndex32(tPos, -1);
3436 }
3437 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3438 tPos = fText->moveIndex32(tPos, -1);
3439 }
729e4ab9 3440 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
73c04bcf
A
3441 continue;
3442 }
3443 }
3444
3445
3446 // LB 17 B2 SP* x B2
3447 if (fB2->contains(thisChar)) {
3448 // Scan backwards, checking for the B2 CM* SP* sequence.
3449 tPos = prevPos;
3450 if (fSP->contains(prevChar)) {
3451 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3452 tPos=fText->moveIndex32(tPos, -1);
3453 }
3454 }
3455 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3456 tPos=fText->moveIndex32(tPos, -1);
3457 }
3458 if (fB2->contains(fText->char32At(tPos))) {
3459 continue;
3460 }
3461 }
3462
46f4442e 3463
73c04bcf
A
3464 // LB 18 break after space
3465 if (fSP->contains(prevChar)) {
3466 break;
3467 }
3468
3469 // LB 19
3470 // x QU
3471 // QU x
3472 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3473 continue;
3474 }
3475
3476 // LB 20 Break around a CB
3477 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3478 break;
3479 }
3480
3481 // LB 21
3482 if (fBA->contains(thisChar) ||
3483 fHY->contains(thisChar) ||
3484 fNS->contains(thisChar) ||
3485 fBB->contains(prevChar) ) {
3486 continue;
3487 }
3488
4388f060
A
3489 // LB 21a
3490 // HL (HY | BA) x
2ca993e8 3491 if (fHL->contains(prevCharX2) &&
4388f060
A
3492 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3493 continue;
3494 }
3495
57a6839d
A
3496 // LB 21b
3497 // SY x HL
51004dcb
A
3498 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3499 continue;
3500 }
3501
73c04bcf 3502 // LB 22
729e4ab9 3503 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
2ca993e8 3504 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
4388f060 3505 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
f3c0d7a5 3506 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
729e4ab9
A
3507 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3508 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
73c04bcf
A
3509 continue;
3510 }
3511
3512
f3c0d7a5
A
3513 // LB 23 (AL | HL) x NU
3514 // NU x (AL | HL)
3515 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3516 continue;
3517 }
3518 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3519 continue;
3520 }
3521
3522 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3523 // PR x (ID | EB | EM)
3524 // (ID | EB | EM) x PO
3525 if (fPR->contains(prevChar) &&
3526 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3527 continue;
3528 }
3529 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3530 fPO->contains(thisChar)) {
73c04bcf
A
3531 continue;
3532 }
3533
3534 // LB 24 Do not break between prefix and letters or ideographs.
f3c0d7a5
A
3535 // (PR | PO) x (AL | HL)
3536 // (AL | HL) x (PR | PO)
3537 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3538 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3539 continue;
3540 }
3541 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3542 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
73c04bcf
A
3543 continue;
3544 }
46f4442e
A
3545
3546
3547
73c04bcf
A
3548 // LB 25 Numbers
3549 if (fNumberMatcher->lookingAt(prevPos, status)) {
3550 if (U_FAILURE(status)) {
3551 break;
3552 }
3553 // Matched a number. But could have been just a single digit, which would
3554 // not represent a "no break here" between prevChar and thisChar
3555 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3556 if (numEndIdx > pos) {
3557 // Number match includes at least our two chars being checked
3558 if (numEndIdx > nextPos) {
3559 // Number match includes additional chars. Update pos and nextPos
3560 // so that next loop iteration will continue at the end of the number,
3561 // checking for breaks between last char in number & whatever follows.
3562 pos = nextPos = numEndIdx;
3563 do {
3564 pos = fText->moveIndex32(pos, -1);
3565 thisChar = fText->char32At(pos);
3566 } while (fCM->contains(thisChar));
3567 }
3568 continue;
3569 }
3570 }
3571
3572
3573 // LB 26 Do not break a Korean syllable.
3574 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3575 fJV->contains(thisChar) ||
3576 fH2->contains(thisChar) ||
3577 fH3->contains(thisChar))) {
3578 continue;
3579 }
3580
3581 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3582 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3583 continue;
3584 }
3585
3586 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3587 fJT->contains(thisChar)) {
3588 continue;
3589 }
3590
3591 // LB 27 Treat a Korean Syllable Block the same as ID.
3592 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3593 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3594 fIN->contains(thisChar)) {
3595 continue;
3596 }
3597 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3598 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3599 fPO->contains(thisChar)) {
3600 continue;
3601 }
3602 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3603 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3604 continue;
3605 }
3606
3607
3608
46f4442e 3609 // LB 28 Do not break between alphabetics ("at").
4388f060 3610 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3611 continue;
3612 }
3613
3614 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
4388f060 3615 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3616 continue;
3617 }
3618
729e4ab9
A
3619 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3620 // (AL | NU) x OP
3621 // CP x (AL | NU)
4388f060 3622 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
729e4ab9
A
3623 continue;
3624 }
4388f060 3625 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
729e4ab9
A
3626 continue;
3627 }
3628
2ca993e8
A
3629 // LB30a RI RI <break> RI
3630 // RI x RI
3631 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3632 break;
3633 }
51004dcb
A
3634 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3635 continue;
3636 }
3637
2ca993e8
A
3638 // LB30b Emoji Base x Emoji Modifier
3639 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3640 continue;
3641 }
3642
73c04bcf
A
3643 // LB 31 Break everywhere else
3644 break;
3645
3646 }
3647
3648 return pos;
3649}
3650
3651
3652UVector *RBBILineMonkey::charClasses() {
3653 return fSets;
3654}
3655
3656
3657RBBILineMonkey::~RBBILineMonkey() {
3658 delete fSets;
3659
3660 delete fBK;
3661 delete fCR;
3662 delete fLF;
3663 delete fCM;
3664 delete fNL;
3665 delete fWJ;
3666 delete fZW;
3667 delete fGL;
3668 delete fCB;
3669 delete fSP;
3670 delete fB2;
3671 delete fBA;
3672 delete fBB;
3673 delete fHY;
3674 delete fH2;
3675 delete fH3;
3676 delete fCL;
729e4ab9 3677 delete fCP;
73c04bcf
A
3678 delete fEX;
3679 delete fIN;
3680 delete fJL;
3681 delete fJV;
3682 delete fJT;
3683 delete fNS;
3684 delete fOP;
3685 delete fQU;
3686 delete fIS;
3687 delete fNU;
3688 delete fPO;
3689 delete fPR;
3690 delete fSY;
3691 delete fAI;
3692 delete fAL;
4388f060
A
3693 delete fCJ;
3694 delete fHL;
73c04bcf 3695 delete fID;
51004dcb 3696 delete fRI;
73c04bcf
A
3697 delete fSG;
3698 delete fXX;
2ca993e8
A
3699 delete fEB;
3700 delete fEM;
3701 delete fZJ;
f3c0d7a5
A
3702 delete fExtendedPict;
3703 delete fEmojiNRK;
73c04bcf
A
3704
3705 delete fCharBI;
3706 delete fNumberMatcher;
3707}
3708
3709
3710//-------------------------------------------------------------------------------------------
3711//
3712// TestMonkey
3713//
3714// params
3715// seed=nnnnn Random number starting seed.
3716// Setting the seed allows errors to be reproduced.
3717// loop=nnn Looping count. Controls running time.
3718// -1: run forever.
3719// 0 or greater: run length.
3720//
3721// type = char | word | line | sent | title
3722//
2ca993e8
A
3723// Example:
3724// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3725//
73c04bcf
A
3726//-------------------------------------------------------------------------------------------
3727
3728static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3729 int32_t val = defaultVal;
3730 name.append(" *= *(-?\\d+)");
3731 UErrorCode status = U_ZERO_ERROR;
3732 RegexMatcher m(name, params, 0, status);
3733 if (m.find()) {
3734 // The param exists. Convert the string to an int.
3735 char valString[100];
3736 int32_t paramLength = m.end(1, status) - m.start(1, status);
3737 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3738 paramLength = (int32_t)(sizeof(valString)-2);
3739 }
3740 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3741 val = strtol(valString, NULL, 10);
3742
3743 // Delete this parameter from the params string.
3744 m.reset();
3745 params = m.replaceFirst("", status);
3746 }
3747 U_ASSERT(U_SUCCESS(status));
3748 return val;
3749}
3750#endif
3751
51004dcb 3752#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
3753static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3754 BreakIterator *bi,
3755 int expected[],
3756 int expectedcount)
3757{
3758 int count = 0;
3759 int i = 0;
3760 int forward[50];
3761 bi->setText(ustr);
3762 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3763 forward[count] = i;
3764 if (count < expectedcount && expected[count] != i) {
3765 test->errln("break forward test failed: expected %d but got %d",
3766 expected[count], i);
3767 break;
3768 }
3769 count ++;
3770 }
3771 if (count != expectedcount) {
3772 printStringBreaks(ustr, expected, expectedcount);
3773 test->errln("break forward test failed: missed %d match",
3774 expectedcount - count);
3775 return;
3776 }
3777 // testing boundaries
3778 for (i = 1; i < expectedcount; i ++) {
3779 int j = expected[i - 1];
3780 if (!bi->isBoundary(j)) {
3781 printStringBreaks(ustr, expected, expectedcount);
3782 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3783 return;
3784 }
3785 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3786 if (bi->isBoundary(j)) {
3787 printStringBreaks(ustr, expected, expectedcount);
3788 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3789 return;
3790 }
3791 }
3792 }
3793
3794 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3795 count --;
3796 if (forward[count] != i) {
51004dcb 3797 printStringBreaks(ustr, expected, expectedcount);
73c04bcf
A
3798 test->errln("happy break test previous() failed: expected %d but got %d",
3799 forward[count], i);
3800 break;
3801 }
3802 }
3803 if (count != 0) {
3804 printStringBreaks(ustr, expected, expectedcount);
3805 test->errln("break test previous() failed: missed a match");
3806 return;
3807 }
3808
3809 // testing preceding
3810 for (i = 0; i < expectedcount - 1; i ++) {
3811 // int j = expected[i] + 1;
3812 int j = ustr.moveIndex32(expected[i], 1);
3813 for (; j <= expected[i + 1]; j ++) {
3814 if (bi->preceding(j) != expected[i]) {
3815 printStringBreaks(ustr, expected, expectedcount);
3816 test->errln("preceding(): Not expecting boundary at position %d", j);
3817 return;
3818 }
3819 }
3820 }
3821}
51004dcb 3822#endif
73c04bcf
A
3823
3824void RBBITest::TestWordBreaks(void)
3825{
3826#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3827
73c04bcf
A
3828 Locale locale("en");
3829 UErrorCode status = U_ZERO_ERROR;
3830 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3831 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
51004dcb
A
3832 // Replaced any C+J characters in a row with a random sequence of characters
3833 // of the same length to make our C+J segmentation not get in the way.
73c04bcf
A
3834 static const char *strlist[] =
3835 {
3836 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
51004dcb 3837 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
46f4442e 3838 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
73c04bcf 3839 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
51004dcb 3840 "\\uac00\\u3588\\u009c\\u0953\\u194b",
73c04bcf
A
3841 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3842 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
51004dcb 3843 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
73c04bcf
A
3844 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3845 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3846 "\\u2027\\U000e0067\\u0a47\\u00b7",
3847 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3848 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3849 "\\u0589\\U000e006e\\u0a42\\U000104a5",
51004dcb 3850 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
73c04bcf
A
3851 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3852 "\\u0027\\u11af\\U000e0057\\u0602",
3853 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3854 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3855 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3856 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
46f4442e 3857 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
73c04bcf
A
3858 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3859 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3860 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3861 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
51004dcb 3862 "\\u18f4\\U000e0049\\u20e7\\u2027",
73c04bcf
A
3863 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3864 "\\ua183\\u102d\\u0bec\\u003a",
3865 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3866 "\\u003a\\u0e57\\u0fad\\u002e",
3867 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3868 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3869 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3870 "\\u003a\\u0664\\u00b7\\u1fba",
3871 "\\u003b\\u0027\\u00b7\\u47a3",
51004dcb 3872 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
73c04bcf
A
3873 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3874 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3875 };
3876 int loop;
3877 if (U_FAILURE(status)) {
729e4ab9 3878 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3879 return;
3880 }
2ca993e8 3881 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf 3882 // printf("looping %d\n", loop);
46f4442e 3883 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
73c04bcf
A
3884 // RBBICharMonkey monkey;
3885 RBBIWordMonkey monkey;
3886
3887 int expected[50];
3888 int expectedcount = 0;
3889
3890 monkey.setText(ustr);
3891 int i;
3892 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3893 expected[expectedcount ++] = i;
3894 }
3895
3896 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3897 }
3898 delete bi;
3899#endif
3900}
3901
3902void RBBITest::TestWordBoundary(void)
3903{
3904 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3905 Locale locale("en");
3906 UErrorCode status = U_ZERO_ERROR;
3907 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3908 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3909 UChar str[50];
3910 static const char *strlist[] =
3911 {
3912 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3913 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3914 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3915 "\\u2027\\U000e0067\\u0a47\\u00b7",
3916 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3917 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3918 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3919 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3920 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3921 "\\u0027\\u11af\\U000e0057\\u0602",
3922 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3923 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3924 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3925 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3926 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
51004dcb 3927 "\\U000e0065\\u302c\\u09ee\\U000e0068",
73c04bcf
A
3928 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3929 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3930 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3931 "\\u58f4\\U000e0049\\u20e7\\u2027",
51004dcb 3932 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
73c04bcf
A
3933 "\\ua183\\u102d\\u0bec\\u003a",
3934 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3935 "\\u003a\\u0e57\\u0fad\\u002e",
3936 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3937 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3938 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3939 "\\u003a\\u0664\\u00b7\\u1fba",
3940 "\\u003b\\u0027\\u00b7\\u47a3",
3941 };
3942 int loop;
3943 if (U_FAILURE(status)) {
729e4ab9 3944 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3945 return;
3946 }
2ca993e8 3947 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf
A
3948 // printf("looping %d\n", loop);
3949 u_unescape(strlist[loop], str, 20);
3950 UnicodeString ustr(str);
3951 int forward[50];
3952 int count = 0;
3953
3954 bi->setText(ustr);
3955 int prev = 0;
3956 int i;
3957 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3958 forward[count ++] = i;
3959 if (i > prev) {
3960 int j;
3961 for (j = prev + 1; j < i; j ++) {
3962 if (bi->isBoundary(j)) {
3963 printStringBreaks(ustr, forward, count);
3964 errln("happy boundary test failed: expected %d not a boundary",
3965 j);
3966 return;
3967 }
3968 }
3969 }
3970 if (!bi->isBoundary(i)) {
3971 printStringBreaks(ustr, forward, count);
3972 errln("happy boundary test failed: expected %d a boundary",
3973 i);
3974 return;
3975 }
3976 prev = i;
3977 }
3978 }
3979 delete bi;
3980}
3981
3982void RBBITest::TestLineBreaks(void)
3983{
3984#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3985 Locale locale("en");
3986 UErrorCode status = U_ZERO_ERROR;
3987 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3988 const int32_t STRSIZE = 50;
3989 UChar str[STRSIZE];
3990 static const char *strlist[] =
3991 {
3992 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3993 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3994 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3995 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3996 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3997 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3998 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3999 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4000 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4001 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
73c04bcf
A
4002 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4003 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4004 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4005 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4006 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4007 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4008 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4009 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4010 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4011 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4012 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4013 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4014 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4015 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4016 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
73c04bcf
A
4017 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4018 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4019 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4020 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4021 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
73c04bcf
A
4022 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4023 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
73c04bcf
A
4024 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4025 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4026 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4027 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4028 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4029 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
73c04bcf
A
4030 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4031 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4032 };
4033 int loop;
4034 TEST_ASSERT_SUCCESS(status);
4035 if (U_FAILURE(status)) {
4036 return;
4037 }
2ca993e8 4038 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
73c04bcf
A
4039 // printf("looping %d\n", loop);
4040 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4041 if (t >= STRSIZE) {
4042 TEST_ASSERT(FALSE);
4043 continue;
4044 }
4045
46f4442e 4046
73c04bcf
A
4047 UnicodeString ustr(str);
4048 RBBILineMonkey monkey;
4049 if (U_FAILURE(monkey.deferredStatus)) {
4050 continue;
4051 }
4052
4053 const int EXPECTEDSIZE = 50;
4054 int expected[EXPECTEDSIZE];
4055 int expectedcount = 0;
4056
4057 monkey.setText(ustr);
4058 int i;
4059 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4060 if (expectedcount >= EXPECTEDSIZE) {
4061 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4062 return;
4063 }
4064 expected[expectedcount ++] = i;
4065 }
4066
4067 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4068 }
4069 delete bi;
4070#endif
4071}
4072
4073void RBBITest::TestSentBreaks(void)
4074{
4075#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4076 Locale locale("en");
4077 UErrorCode status = U_ZERO_ERROR;
4078 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4079 UChar str[200];
4080 static const char *strlist[] =
4081 {
4082 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4083 "This\n",
4084 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4085 "\"Sentence ending with a quote.\" Bye.",
4086 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4087 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4088 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4089 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4090 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4091 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4092 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4093 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4094 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4095 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4096 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4097 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4098 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4099 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4100 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4101 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4102 };
4103 int loop;
4104 if (U_FAILURE(status)) {
729e4ab9 4105 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
4106 return;
4107 }
2ca993e8
A
4108 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4109 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
73c04bcf
A
4110 UnicodeString ustr(str);
4111
4112 RBBISentMonkey monkey;
4113 if (U_FAILURE(monkey.deferredStatus)) {
4114 continue;
4115 }
4116
4117 const int EXPECTEDSIZE = 50;
4118 int expected[EXPECTEDSIZE];
4119 int expectedcount = 0;
4120
4121 monkey.setText(ustr);
4122 int i;
4123 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4124 if (expectedcount >= EXPECTEDSIZE) {
4125 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4126 return;
4127 }
4128 expected[expectedcount ++] = i;
4129 }
4130
4131 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4132 }
4133 delete bi;
4134#endif
4135}
4136
f3c0d7a5 4137void RBBITest::TestMonkey() {
73c04bcf
A
4138#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4139
4140 UErrorCode status = U_ZERO_ERROR;
4141 int32_t loopCount = 500;
4142 int32_t seed = 1;
4143 UnicodeString breakType = "all";
4144 Locale locale("en");
4145 UBool useUText = FALSE;
4146
4147 if (quick == FALSE) {
4148 loopCount = 10000;
4149 }
4150
f3c0d7a5
A
4151 if (fTestParams) {
4152 UnicodeString p(fTestParams);
73c04bcf
A
4153 loopCount = getIntParam("loop", p, loopCount);
4154 seed = getIntParam("seed", p, seed);
4155
4156 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4157 if (m.find()) {
4158 breakType = m.group(1, status);
4159 m.reset();
4160 p = m.replaceFirst("", status);
4161 }
4162
4163 RegexMatcher u(" *utext", p, 0, status);
4164 if (u.find()) {
4165 useUText = TRUE;
4166 u.reset();
4167 p = u.replaceFirst("", status);
4168 }
4169
4170
4171 // m.reset(p);
46f4442e 4172 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
73c04bcf
A
4173 // Each option is stripped out of the option string as it is processed.
4174 // All options have been checked. The option string should have been completely emptied..
4175 char buf[100];
4176 p.extract(buf, sizeof(buf), NULL, status);
4177 buf[sizeof(buf)-1] = 0;
4178 errln("Unrecognized or extra parameter: %s\n", buf);
4179 return;
4180 }
4181
4182 }
4183
4184 if (breakType == "char" || breakType == "all") {
4185 RBBICharMonkey m;
4186 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4187 if (U_SUCCESS(status)) {
4188 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4189 if (breakType == "all" && useUText==FALSE) {
4190 // Also run a quick test with UText when "all" is specified
4191 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4192 }
4193 }
4194 else {
729e4ab9 4195 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
73c04bcf
A
4196 }
4197 delete bi;
4198 }
4199
4200 if (breakType == "word" || breakType == "all") {
4201 logln("Word Break Monkey Test");
4202 RBBIWordMonkey m;
4203 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4204 if (U_SUCCESS(status)) {
4205 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4206 }
4207 else {
729e4ab9 4208 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
73c04bcf
A
4209 }
4210 delete bi;
4211 }
4212
4213 if (breakType == "line" || breakType == "all") {
4214 logln("Line Break Monkey Test");
4215 RBBILineMonkey m;
4216 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4217 if (loopCount >= 10) {
4218 loopCount = loopCount / 5; // Line break runs slower than the others.
4219 }
4220 if (U_SUCCESS(status)) {
4221 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4222 }
4223 else {
729e4ab9 4224 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
4225 }
4226 delete bi;
4227 }
4228
46f4442e 4229 if (breakType == "sent" || breakType == "all" ) {
73c04bcf
A
4230 logln("Sentence Break Monkey Test");
4231 RBBISentMonkey m;
4232 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4233 if (loopCount >= 10) {
4234 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4235 }
4236 if (U_SUCCESS(status)) {
4237 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4238 }
4239 else {
729e4ab9 4240 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
4241 }
4242 delete bi;
4243 }
4244
4245#endif
4246}
4247
4248//
4249// Run a RBBI monkey test. Common routine, for all break iterator types.
4250// Parameters:
4251// bi - the break iterator to use
4252// mk - MonkeyKind, abstraction for obtaining expected results
4253// name - Name of test (char, word, etc.) for use in error messages
4254// seed - Seed for starting random number generator (parameter from user)
4255// numIterations
4256//
46f4442e 4257void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
73c04bcf
A
4258 int32_t numIterations, UBool useUText) {
4259
4260#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4261
4262 const int32_t TESTSTRINGLEN = 500;
4263 UnicodeString testText;
4264 int32_t numCharClasses;
4265 UVector *chClasses;
4266 int expected[TESTSTRINGLEN*2 + 1];
4267 int expectedCount = 0;
4268 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4269 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4270 char reverseBreaks[TESTSTRINGLEN*2+1];
4271 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4272 char followingBreaks[TESTSTRINGLEN*2+1];
4273 char precedingBreaks[TESTSTRINGLEN*2+1];
4274 int i;
4275 int loopCount = 0;
4276
4277 m_seed = seed;
4278
4279 numCharClasses = mk.charClasses()->size();
4280 chClasses = mk.charClasses();
4281
4282 // Check for errors that occured during the construction of the MonkeyKind object.
4283 // Can't report them where they occured because errln() is a method coming from intlTest,
4284 // and is not visible outside of RBBITest :-(
4285 if (U_FAILURE(mk.deferredStatus)) {
4286 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4287 return;
4288 }
4289
4290 // Verify that the character classes all have at least one member.
4291 for (i=0; i<numCharClasses; i++) {
4292 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4293 if (s == NULL || s->size() == 0) {
4294 errln("Character Class #%d is null or of zero size.", i);
4295 return;
4296 }
4297 }
4298
4299 while (loopCount < numIterations || numIterations == -1) {
4300 if (numIterations == -1 && loopCount % 10 == 0) {
4301 // If test is running in an infinite loop, display a periodic tic so
4302 // we can tell that it is making progress.
4303 fprintf(stderr, ".");
4304 }
4305 // Save current random number seed, so that we can recreate the random numbers
4306 // for this loop iteration in event of an error.
4307 seed = m_seed;
4308
4309 // Populate a test string with data.
4310 testText.truncate(0);
4311 for (i=0; i<TESTSTRINGLEN; i++) {
4312 int32_t aClassNum = m_rand() % numCharClasses;
4313 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4314 int32_t charIdx = m_rand() % classSet->size();
4315 UChar32 c = classSet->charAt(charIdx);
4316 if (c < 0) { // TODO: deal with sets containing strings.
2ca993e8 4317 errln("%s:%d c < 0", __FILE__, __LINE__);
73c04bcf
A
4318 break;
4319 }
2ca993e8
A
4320 // Do not assemble a supplementary character from randomly generated separate surrogates.
4321 // (It could be a dictionary character)
4322 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4323 continue;
4324 }
4325
73c04bcf
A
4326 testText.append(c);
4327 }
4328
4329 // Calculate the expected results for this test string.
4330 mk.setText(testText);
4331 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4332 expectedBreaks[0] = 1;
4333 int32_t breakPos = 0;
4334 expectedCount = 0;
4335 for (;;) {
4336 breakPos = mk.next(breakPos);
4337 if (breakPos == -1) {
4338 break;
4339 }
4340 if (breakPos > testText.length()) {
4341 errln("breakPos > testText.length()");
4342 }
4343 expectedBreaks[breakPos] = 1;
4344 U_ASSERT(expectedCount<testText.length());
4345 expected[expectedCount ++] = breakPos;
57a6839d
A
4346 (void)expected; // Set but not used warning.
4347 // TODO (andy): check it out.
73c04bcf
A
4348 }
4349
4350 // Find the break positions using forward iteration
4351 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4352 if (useUText) {
4353 UErrorCode status = U_ZERO_ERROR;
4354 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4355 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4356 bi->setText(testUText, status);
4357 TEST_ASSERT_SUCCESS(status);
4358 utext_close(testUText); // The break iterator does a shallow clone of the UText
4359 // This UText can be closed immediately, so long as the
4360 // testText string continues to exist.
4361 } else {
4362 bi->setText(testText);
4363 }
4364
4365 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4366 if (i < 0 || i > testText.length()) {
4367 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4368 break;
4369 }
4370 forwardBreaks[i] = 1;
4371 }
4372
4373 // Find the break positions using reverse iteration
4374 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4375 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4376 if (i < 0 || i > testText.length()) {
4377 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4378 break;
4379 }
4380 reverseBreaks[i] = 1;
4381 }
4382
4383 // Find the break positions using isBoundary() tests.
4384 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4385 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4386 for (i=0; i<=testText.length(); i++) {
4387 isBoundaryBreaks[i] = bi->isBoundary(i);
4388 }
4389
4390
4391 // Find the break positions using the following() function.
4392 // printf(".");
4393 memset(followingBreaks, 0, sizeof(followingBreaks));
4394 int32_t lastBreakPos = 0;
4395 followingBreaks[0] = 1;
4396 for (i=0; i<testText.length(); i++) {
4397 breakPos = bi->following(i);
4398 if (breakPos <= i ||
4399 breakPos < lastBreakPos ||
4400 breakPos > testText.length() ||
729e4ab9 4401 (breakPos > lastBreakPos && lastBreakPos > i)) {
73c04bcf
A
4402 errln("%s break monkey test: "
4403 "Out of range value returned by BreakIterator::following().\n"
4404 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4405 name, seed, i, breakPos, lastBreakPos);
4406 break;
4407 }
4408 followingBreaks[breakPos] = 1;
4409 lastBreakPos = breakPos;
4410 }
4411
4412 // Find the break positions using the preceding() function.
46f4442e 4413 memset(precedingBreaks, 0, sizeof(precedingBreaks));
73c04bcf
A
4414 lastBreakPos = testText.length();
4415 precedingBreaks[testText.length()] = 1;
4416 for (i=testText.length(); i>0; i--) {
4417 breakPos = bi->preceding(i);
4418 if (breakPos >= i ||
4419 breakPos > lastBreakPos ||
729e4ab9
A
4420 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4421 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
73c04bcf
A
4422 errln("%s break monkey test: "
4423 "Out of range value returned by BreakIterator::preceding().\n"
4424 "index=%d; prev returned %d; lastBreak=%d" ,
4425 name, i, breakPos, lastBreakPos);
46f4442e
A
4426 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4427 precedingBreaks[i] = 2; // Forces an error.
4428 }
73c04bcf 4429 } else {
46f4442e
A
4430 if (breakPos >= 0) {
4431 precedingBreaks[breakPos] = 1;
2ca993e8 4432 }
73c04bcf
A
4433 lastBreakPos = breakPos;
4434 }
4435 }
4436
4437 // Compare the expected and actual results.
4438 for (i=0; i<=testText.length(); i++) {
4439 const char *errorType = NULL;
4440 if (forwardBreaks[i] != expectedBreaks[i]) {
4441 errorType = "next()";
4442 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4443 errorType = "previous()";
4444 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4445 errorType = "isBoundary()";
4446 } else if (followingBreaks[i] != expectedBreaks[i]) {
4447 errorType = "following()";
4448 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4449 errorType = "preceding()";
4450 }
4451
4452
4453 if (errorType != NULL) {
4454 // Format a range of the test text that includes the failure as
4455 // a data item that can be included in the rbbi test data file.
4456
4457 // Start of the range is the last point where expected and actual results
4458 // both agreed that there was a break position.
4459 int startContext = i;
4460 int32_t count = 0;
4461 for (;;) {
4462 if (startContext==0) { break; }
4463 startContext --;
4464 if (expectedBreaks[startContext] != 0) {
4465 if (count == 2) break;
4466 count ++;
4467 }
4468 }
4469
4470 // End of range is two expected breaks past the start position.
4471 int endContext = i + 1;
4472 int ci;
4473 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4474 for (;;) {
4475 if (endContext >= testText.length()) {break;}
4476 if (expectedBreaks[endContext-1] != 0) {
4477 if (count == 0) break;
4478 count --;
4479 }
4480 endContext ++;
4481 }
4482 }
4483
4484 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4485 UnicodeString errorText = "<data>";
4486 /***if (strcmp(errorType, "next()") == 0) {
4487 startContext = 0;
4488 endContext = testText.length();
4489
4490 printStringBreaks(testText, expected, expectedCount);
4491 }***/
4492
4493 for (ci=startContext; ci<endContext;) {
4494 UnicodeString hexChars("0123456789abcdef");
4495 UChar32 c;
4496 int bn;
4497 c = testText.char32At(ci);
4498 if (ci == i) {
4499 // This is the location of the error.
4500 errorText.append("<?>");
4501 } else if (expectedBreaks[ci] != 0) {
4502 // This a non-error expected break position.
4503 errorText.append("\\");
4504 }
4505 if (c < 0x10000) {
4506 errorText.append("\\u");
4507 for (bn=12; bn>=0; bn-=4) {
4508 errorText.append(hexChars.charAt((c>>bn)&0xf));
4509 }
4510 } else {
4511 errorText.append("\\U");
4512 for (bn=28; bn>=0; bn-=4) {
4513 errorText.append(hexChars.charAt((c>>bn)&0xf));
4514 }
4515 }
4516 ci = testText.moveIndex32(ci, 1);
4517 }
4518 errorText.append("\\");
4519 errorText.append("</data>\n");
4520
4521 // Output the error
4522 char charErrorTxt[500];
4523 UErrorCode status = U_ZERO_ERROR;
4524 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4525 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4388f060 4526 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
2ca993e8 4527
4388f060
A
4528 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4529 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
73c04bcf
A
4530 errorType, seed, i, charErrorTxt);
4531 break;
4532 }
4533 }
4534
4535 loopCount++;
4536 }
4537#endif
4538}
4539
729e4ab9
A
4540
4541// Bug 5532. UTF-8 based UText fails in dictionary code.
4542// This test checks the initial patch,
4543// which is to just keep it from crashing. Correct word boundaries
4544// await a proper fix to the dictionary code.
4545//
4546void RBBITest::TestBug5532(void) {
4547 // Text includes a mixture of Thai and Latin.
4548 const unsigned char utf8Data[] = {
4549 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
2ca993e8 4550 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
729e4ab9
A
4551 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4552 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4553 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
2ca993e8
A
4554 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4555 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4556 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4557 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4558 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
729e4ab9
A
4559 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4560
4561 UErrorCode status = U_ZERO_ERROR;
4562 UText utext=UTEXT_INITIALIZER;
4563 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4564 TEST_ASSERT_SUCCESS(status);
4565
4566 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4567 TEST_ASSERT_SUCCESS(status);
4568 if (U_SUCCESS(status)) {
4569 bi->setText(&utext, status);
4570 TEST_ASSERT_SUCCESS(status);
4571
4572 int32_t breakCount = 0;
4573 int32_t previousBreak = -1;
4574 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4575 // For now, just make sure that the break iterator doesn't hang.
4576 TEST_ASSERT(previousBreak < bi->current());
4577 previousBreak = bi->current();
4578 }
4579 TEST_ASSERT(breakCount > 0);
4580 }
4581 delete bi;
4582 utext_close(&utext);
4583}
4584
4585
51004dcb
A
4586void RBBITest::TestBug9983(void) {
4587 UnicodeString text = UnicodeString("\\u002A" // * Other
4588 "\\uFF65" // Other
4589 "\\u309C" // Katakana
4590 "\\uFF9F" // Extend
4591 "\\uFF65" // Other
4592 "\\u0020" // Other
4593 "\\u0000").unescape();
4594
4595 UErrorCode status = U_ZERO_ERROR;
4596 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4597 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4598 TEST_ASSERT_SUCCESS(status);
57a6839d
A
4599 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4600 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4601 TEST_ASSERT_SUCCESS(status);
51004dcb
A
4602 if (U_FAILURE(status)) {
4603 return;
4604 }
57a6839d
A
4605 int32_t offset, rstatus, iterationCount;
4606
51004dcb 4607 brkiter->setText(text);
51004dcb 4608 brkiter->last();
57a6839d 4609 iterationCount = 0;
51004dcb
A
4610 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4611 iterationCount++;
4612 rstatus = brkiter->getRuleStatus();
57a6839d
A
4613 (void)rstatus; // Suppress set but not used warning.
4614 if (iterationCount >= 10) {
2ca993e8 4615 break;
57a6839d
A
4616 }
4617 }
4618 TEST_ASSERT(iterationCount == 6);
4619
4620 brkiterPOSIX->setText(text);
4621 brkiterPOSIX->last();
4622 iterationCount = 0;
4623 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4624 iterationCount++;
4625 rstatus = brkiterPOSIX->getRuleStatus();
4626 (void)rstatus; // Suppress set but not used warning.
51004dcb 4627 if (iterationCount >= 10) {
2ca993e8 4628 break;
51004dcb
A
4629 }
4630 }
4631 TEST_ASSERT(iterationCount == 6);
4632}
4633
f3c0d7a5
A
4634// Bug 7547 - verify that building a break itereator from empty rules produces an error.
4635//
4636void RBBITest::TestBug7547() {
4637 UnicodeString rules;
4638 UErrorCode status = U_ZERO_ERROR;
4639 UParseError parseError;
4640 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4641 if (status != U_BRK_RULE_SYNTAX) {
4642 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4643 }
4644 if (parseError.line != 1 || parseError.offset != 0) {
4645 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4646 }
4647}
4648
4649
4650void RBBITest::TestBug12797() {
4651 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4652 UErrorCode status = U_ZERO_ERROR;
4653 UParseError parseError;
4654 RuleBasedBreakIterator bi(rules, parseError, status);
4655 if (U_FAILURE(status)) {
4656 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4657 return;
4658 }
4659 UnicodeString text = "abc";
4660 bi.setText(text);
4661 bi.first();
4662 int32_t boundary = bi.next();
4663 if (boundary != 3) {
4664 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4665 }
4666}
4667
4668void RBBITest::TestBug12918() {
4669 // This test triggers an assertion failure in dictbe.cpp
4670 const UChar *crasherString = u"\u3325\u4a16";
4671 UErrorCode status = U_ZERO_ERROR;
4672 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4673 if (U_FAILURE(status)) {
4674 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4675 return;
4676 }
4677 ubrk_first(iter);
4678 int32_t pos = 0;
4679 int32_t lastPos = -1;
4680 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4681 if (pos <= lastPos) {
4682 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4683 break;
4684 }
4685 }
4686 ubrk_close(iter);
4687}
4688
4689void RBBITest::TestBug12932() {
4690 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4691 UnicodeString ruleStr(
4692 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4693 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4694 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4695 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4696 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4697 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4698
4699 UErrorCode status = U_ZERO_ERROR;
4700 UParseError parseError;
4701 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4702 if (status != U_BRK_RULE_SYNTAX) {
4703 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4704 __FILE__, __LINE__, u_errorName(status));
4705 }
4706}
4707
4708
4709// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4710// remain undevided by ICU char, word and line break.
4711void RBBITest::TestEmoji() {
4712 UErrorCode status = U_ZERO_ERROR;
4713
4714 CharString testFileName;
4715 testFileName.append(IntlTest::getSourceTestData(status), status);
4716 testFileName.appendPathPart("emoji-test.txt", status);
4717 if (U_FAILURE(status)) {
4718 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4719 return;
4720 }
4721 logln("Opening data file %s\n", testFileName.data());
4722
4723 int len;
4724 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4725 if (U_FAILURE(status) || testFile == NULL) {
4726 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4727 return;
4728 }
4729 UnicodeString testFileAsString(testFile, len);
4730 delete [] testFile;
4731
4732 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4733 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4734 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4735 int32_t lineNumber = 0;
4736
4737 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4738 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4739 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4740 if (U_FAILURE(status)) {
4741 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4742 return;
4743 }
4744
4745 while (lineMatcher.find()) {
4746 ++lineNumber;
4747 UnicodeString line = lineMatcher.group(status);
4748 hexMatcher.reset(line);
4749 UnicodeString testString; // accumulates the emoji sequence.
4750 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4751 UnicodeString hex = hexMatcher.group(1, status);
4752 if (hex.length() > 8) {
4753 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4754 break;
4755 }
4756 CharString hex8;
4757 hex8.appendInvariantChars(hex, status);
4758 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4759 if (c<=0x10ffff) {
4760 testString.append(c);
4761 } else {
4762 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4763 __FILE__, __LINE__, lineNumber, hex8.data());
4764 break;
4765 }
4766 }
4767
4768 if (testString.length() > 1) {
4769 charBreaks->setText(testString);
4770 charBreaks->first();
4771 int32_t firstBreak = charBreaks->next();
4772 if (testString.length() != firstBreak) {
4773 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4774 __FILE__, __LINE__, lineNumber, firstBreak);
4775 }
4776 wordBreaks->setText(testString);
4777 wordBreaks->first();
4778 firstBreak = wordBreaks->next();
4779 if (testString.length() != firstBreak) {
4780 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4781 __FILE__, __LINE__, lineNumber, firstBreak);
4782 }
4783 lineBreaks->setText(testString);
4784 lineBreaks->first();
4785 firstBreak = lineBreaks->next();
4786 if (testString.length() != firstBreak) {
4787 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4788 __FILE__, __LINE__, lineNumber, firstBreak);
4789 }
4790 }
4791 }
4792}
4793
51004dcb 4794
73c04bcf
A
4795//
4796// TestDebug - A place-holder test for debugging purposes.
4797// For putting in fragments of other tests that can be invoked
4798// for tracing without a lot of unwanted extra stuff happening.
4799//
4800void RBBITest::TestDebug(void) {
73c04bcf
A
4801}
4802
4388f060
A
4803void RBBITest::TestProperties() {
4804 UErrorCode errorCode = U_ZERO_ERROR;
4805 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4806 if (!prependSet.isEmpty()) {
4807 errln(
4808 "[:GCB=Prepend:] is not empty any more. "
4809 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4810 "change this test to the opposite condition.");
4811 }
4812}
4813
73c04bcf 4814#endif /* #if !UCONFIG_NO_BREAK_ITERATION */