]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-511.34.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include "utypeinfo.h" // for 'typeid' to work
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
28 #endif
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
31 #include "intltest.h"
32 #include "rbbitst.h"
33 #include <string.h>
34 #include "uvector.h"
35 #include "uvectr32.h"
36 #include <string.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
41
// TEST_ASSERT(x)
//    Reports a test failure, tagged with the source file and line of the
//    invocation, when the expression x evaluates to false.
//    Expands to a braced block that calls errln(), so it can only be used
//    where errln() is callable (i.e. inside test member functions).
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode)
//    Reports a failure through errcheckln(), including the ICU error name,
//    when errcode holds a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
47
48
49 //---------------------------------------------
50 // runIndexedTest
51 //---------------------------------------------
52
53
54 // Note: Before adding new tests to this file, check whether the desired test data can
55 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
56 // it's much less work than writing a new test, diagnostic output in the event of failures
57 // is good, and the test data file is shared with ICU4J, so eventually the test
58 // will run there as well, without additional effort.
59
60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61 {
62 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63
64 switch (index) {
65 #if !UCONFIG_NO_FILE_IO
66 case 0: name = "TestBug4153072";
67 if(exec) TestBug4153072(); break;
68 #else
69 case 0: name = "skip";
70 break;
71 #endif
72
73 case 1: name = "skip";
74 break;
75 case 2: name = "TestStatusReturn";
76 if(exec) TestStatusReturn(); break;
77
78 #if !UCONFIG_NO_FILE_IO
79 case 3: name = "TestUnicodeFiles";
80 if(exec) TestUnicodeFiles(); break;
81 case 4: name = "TestEmptyString";
82 if(exec) TestEmptyString(); break;
83 #else
84 case 3: case 4: name = "skip";
85 break;
86 #endif
87
88 case 5: name = "TestGetAvailableLocales";
89 if(exec) TestGetAvailableLocales(); break;
90
91 case 6: name = "TestGetDisplayName";
92 if(exec) TestGetDisplayName(); break;
93
94 #if !UCONFIG_NO_FILE_IO
95 case 7: name = "TestEndBehaviour";
96 if(exec) TestEndBehaviour(); break;
97 case 8: case 9: case 10: name = "skip";
98 break;
99 case 11: name = "TestWordBreaks";
100 if(exec) TestWordBreaks(); break;
101 case 12: name = "TestWordBoundary";
102 if(exec) TestWordBoundary(); break;
103 case 13: name = "TestLineBreaks";
104 if(exec) TestLineBreaks(); break;
105 case 14: name = "TestSentBreaks";
106 if(exec) TestSentBreaks(); break;
107 case 15: name = "TestExtended";
108 if(exec) TestExtended(); break;
109 #else
110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111 break;
112 #endif
113
114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115 case 16:
116 name = "TestMonkey"; if(exec) TestMonkey(params); break;
117 #else
118 case 16:
119 name = "skip"; break;
120 #endif
121
122 #if !UCONFIG_NO_FILE_IO
123 case 17: name = "TestBug3818";
124 if(exec) TestBug3818(); break;
125 #else
126 case 17: name = "skip";
127 break;
128 #endif
129
130 case 18: name = "skip";
131 break;
132 case 19: name = "TestDebug";
133 if(exec) TestDebug(); break;
134 case 20: name = "skip";
135 break;
136
137 #if !UCONFIG_NO_FILE_IO
138 case 21: name = "TestBug5775";
139 if (exec) TestBug5775(); break;
140 #else
141 case 21: name = "skip";
142 break;
143 #endif
144
145 case 22: name = "TestBug9983";
146 if (exec) TestBug9983(); break;
147 case 23: name = "TestDictRules";
148 if (exec) TestDictRules(); break;
149 case 24: name = "TestBug5532";
150 if (exec) TestBug5532(); break;
151 default: name = ""; break; //needed to end loop
152 }
153 }
154
155
156 //---------------------------------------------------------------------------
157 //
158 // class BITestData Holds a set of Break iterator test data and results
159 // Includes
160 // - the string data to be broken
161 // - a vector of the expected break positions.
162 // - a vector of source line numbers for the data,
163 // (to help see where errors occurred.)
164 // - The expected break tag values.
165 // - Vectors of actual break positions and tag values.
166 // - Functions for comparing actual with expected and
167 // reporting errors.
168 //
169 //----------------------------------------------------------------------------
170 class BITestData {
171 public:
172 UnicodeString fDataToBreak;
173 UVector fExpectedBreakPositions;
174 UVector fExpectedTags;
175 UVector fLineNum;
176 UVector fActualBreakPositions; // Test Results.
177 UVector fActualTags;
178
179 BITestData(UErrorCode &status);
180 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
181 void checkResults(const char *heading, RBBITest *test);
182 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
183 void clearResults();
184 };
185
186 //
187 // Constructor.
188 //
189 BITestData::BITestData(UErrorCode &status)
190 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
191 fActualTags(status)
192 {
193 }
194
195 //
196 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
197 // The macro form collects the line number, which is helpful
198 // when tracking down failures.
199 //
200 // A null data item is inserted at the start of each test's data
201 // to put the starting zero into the data list. The position saved for
202 // each non-null item is its ending position.
203 //
204 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206 if (U_FAILURE(status)) {return;}
207 if (data != NULL) {
208 fDataToBreak.append(CharsToUnicodeString(data));
209 }
210 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211 fExpectedTags.addElement(tag, status);
212 fLineNum.addElement(lineNum, status);
213 }
214
215
216 //
217 // checkResults. Compare the actual and expected break positions, report any differences.
218 //
219 void BITestData::checkResults(const char *heading, RBBITest *test) {
220 int32_t expectedIndex = 0;
221 int32_t actualIndex = 0;
222
223 for (;;) {
224 // If we've run through both the expected and actual results vectors, we're done.
225 // break out of the loop.
226 if (expectedIndex >= fExpectedBreakPositions.size() &&
227 actualIndex >= fActualBreakPositions.size()) {
228 break;
229 }
230
231
232 if (expectedIndex >= fExpectedBreakPositions.size()) {
233 err(heading, test, expectedIndex-1, actualIndex);
234 actualIndex++;
235 continue;
236 }
237
238 if (actualIndex >= fActualBreakPositions.size()) {
239 err(heading, test, expectedIndex, actualIndex-1);
240 expectedIndex++;
241 continue;
242 }
243
244 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245 err(heading, test, expectedIndex, actualIndex);
246 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248 actualIndex++;
249 } else {
250 expectedIndex++;
251 }
252 continue;
253 }
254
255 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
257 heading, fLineNum.elementAt(expectedIndex),
258 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259 }
260
261 actualIndex++;
262 expectedIndex++;
263 }
264 }
265
266 //
267 // err - An error was found. Report it, along with information about where the
268 // incorrectly broken test data appeared in the source file.
269 //
270 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271 {
272 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
273 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
274 int32_t o = 0;
275 int32_t line = fLineNum.elementAti(expectedIdx);
276 if (expectedIdx > 0) {
277 // The line numbers are off by one because a premature break occurs somewhere
278 // within the previous item, rather than at the start of the current (expected) item.
279 // We want to report the offset of the unexpected break from the start of
280 // this previous item.
281 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282 }
283 if (actual < expected) {
284 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
285 } else {
286 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
287 }
288 }
289
290
291 void BITestData::clearResults() {
292 fActualBreakPositions.removeAllElements();
293 fActualTags.removeAllElements();
294 }
295
296
297 //--------------------------------------------------------------------------------------
298 //
299 // RBBITest constructor and destructor
300 //
301 //--------------------------------------------------------------------------------------
302
303 RBBITest::RBBITest() {
304 }
305
306
307 RBBITest::~RBBITest() {
308 }
309
310 //-----------------------------------------------------------------------------------
311 //
312 // Test for status {tag} return value from break rules.
313 // TODO: a more thorough test.
314 //
315 //-----------------------------------------------------------------------------------
316 void RBBITest::TestStatusReturn() {
317 UnicodeString rulesString1("$Letters = [:L:];\n"
318 "$Numbers = [:N:];\n"
319 "$Letters+{1};\n"
320 "$Numbers+{2};\n"
321 "Help\\ {4}/me\\!;\n"
322 "[^$Letters $Numbers];\n"
323 "!.*;\n", -1, US_INV);
324 UnicodeString testString1 = "abc123..abc Help me Help me!";
325 // 01234567890123456789012345678
326 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
328
329 UErrorCode status=U_ZERO_ERROR;
330 UParseError parseError;
331
332 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
333 if(U_FAILURE(status)) {
334 dataerrln("FAIL : in construction - %s", u_errorName(status));
335 } else {
336 int32_t pos;
337 int32_t i = 0;
338 bi->setText(testString1);
339 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340 if (pos != bounds1[i]) {
341 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
342 break;
343 }
344
345 int tag = bi->getRuleStatus();
346 if (tag != brkStatus[i]) {
347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348 break;
349 }
350 i++;
351 }
352 }
353 delete bi;
354 }
355
356
357 static void printStringBreaks(UnicodeString ustr, int expected[],
358 int expectedcount)
359 {
360 UErrorCode status = U_ZERO_ERROR;
361 char name[100];
362 printf("code alpha extend alphanum type word sent line name\n");
363 int j;
364 for (j = 0; j < ustr.length(); j ++) {
365 if (expectedcount > 0) {
366 int k;
367 for (k = 0; k < expectedcount; k ++) {
368 if (j == expected[k]) {
369 printf("------------------------------------------------ %d\n",
370 j);
371 }
372 }
373 }
374 UChar32 c = ustr.char32At(j);
375 if (c > 0xffff) {
376 j ++;
377 }
378 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380 u_isUAlphabetic(c),
381 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382 u_isalnum(c),
383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384 u_charType(c),
385 U_SHORT_PROPERTY_NAME),
386 u_getPropertyValueName(UCHAR_WORD_BREAK,
387 u_getIntPropertyValue(c,
388 UCHAR_WORD_BREAK),
389 U_SHORT_PROPERTY_NAME),
390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391 u_getIntPropertyValue(c,
392 UCHAR_SENTENCE_BREAK),
393 U_SHORT_PROPERTY_NAME),
394 u_getPropertyValueName(UCHAR_LINE_BREAK,
395 u_getIntPropertyValue(c,
396 UCHAR_LINE_BREAK),
397 U_SHORT_PROPERTY_NAME),
398 name);
399 }
400 }
401
402
403 void RBBITest::TestBug3818() {
404 UErrorCode status = U_ZERO_ERROR;
405
406 // Four Thai words...
407 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409 UnicodeString thaiStr(thaiWordData);
410
411 RuleBasedBreakIterator* bi =
412 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
413 if (U_FAILURE(status) || bi == NULL) {
414 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
415 return;
416 }
417 bi->setText(thaiStr);
418
419 int32_t startOfSecondWord = bi->following(1);
420 if (startOfSecondWord != 4) {
421 errln("Fail at file %s, line %d expected start of word at 4, got %d",
422 __FILE__, __LINE__, startOfSecondWord);
423 }
424 startOfSecondWord = bi->following(0);
425 if (startOfSecondWord != 4) {
426 errln("Fail at file %s, line %d expected start of word at 4, got %d",
427 __FILE__, __LINE__, startOfSecondWord);
428 }
429 delete bi;
430 }
431
432 //----------------------------------------------------------------------------
433 //
434 // generalIteratorTest Given a break iterator and a set of test data,
435 // Run the tests and report the results.
436 //
437 //----------------------------------------------------------------------------
438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
439 {
440
441 bi.setText(td.fDataToBreak);
442
443 testFirstAndNext(bi, td);
444
445 testLastAndPrevious(bi, td);
446
447 testFollowing(bi, td);
448 testPreceding(bi, td);
449 testIsBoundary(bi, td);
450 doMultipleSelectionTest(bi, td);
451 }
452
453
454 //
455 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
456 // kind of loop.
457 //
458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
459 {
460 UErrorCode status = U_ZERO_ERROR;
461 int32_t p;
462 int32_t lastP = -1;
463 int32_t tag;
464
465 logln("Test first and next");
466 bi.setText(td.fDataToBreak);
467 td.clearResults();
468
469 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
470 td.fActualBreakPositions.addElement(p, status); // Save result.
471 tag = bi.getRuleStatus();
472 td.fActualTags.addElement(tag, status);
473 if (p <= lastP) {
474 // If the iterator is not making forward progress, stop.
475 // No need to raise an error here, it'll be detected in the normal check of results.
476 break;
477 }
478 lastP = p;
479 }
480 td.checkResults("testFirstAndNext", this);
481 }
482
483
484 //
485 // TestLastAndPrevious. Run the iterator backwards, starting with last().
486 //
487 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
488 {
489 UErrorCode status = U_ZERO_ERROR;
490 int32_t p;
491 int32_t lastP = 0x7ffffffe;
492 int32_t tag;
493
494 logln("Test last and previous");
495 bi.setText(td.fDataToBreak);
496 td.clearResults();
497
498 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
499 // Save break position. Insert it at start of vector of results, shoving
500 // already-saved results further towards the end.
501 td.fActualBreakPositions.insertElementAt(p, 0, status);
502 // bi.previous(); // TODO: Why does this fix things up????
503 // bi.next();
504 tag = bi.getRuleStatus();
505 td.fActualTags.insertElementAt(tag, 0, status);
506 if (p >= lastP) {
507 // If the iterator is not making progress, stop.
508 // No need to raise an error here, it'll be detected in the normal check of results.
509 break;
510 }
511 lastP = p;
512 }
513 td.checkResults("testLastAndPrevious", this);
514 }
515
516
517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
518 {
519 UErrorCode status = U_ZERO_ERROR;
520 int32_t p;
521 int32_t tag;
522 int32_t lastP = -2; // A value that will never be returned as a break position.
523 // cannot be -1; that is returned for DONE.
524 int i;
525
526 logln("testFollowing():");
527 bi.setText(td.fDataToBreak);
528 td.clearResults();
529
530 // Save the starting point, since we won't get that out of following.
531 p = bi.first();
532 td.fActualBreakPositions.addElement(p, status); // Save result.
533 tag = bi.getRuleStatus();
534 td.fActualTags.addElement(tag, status);
535
536 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
537 p = bi.following(i);
538 if (p != lastP) {
539 if (p == RuleBasedBreakIterator::DONE) {
540 break;
541 }
542 // We've reached a new break position. Save it.
543 td.fActualBreakPositions.addElement(p, status); // Save result.
544 tag = bi.getRuleStatus();
545 td.fActualTags.addElement(tag, status);
546 lastP = p;
547 }
548 }
549 // The loop normally exits by means of the break in the middle.
550 // Make sure that the index was at the correct position for the break iterator to have
551 // returned DONE.
552 if (i != td.fDataToBreak.length()) {
553 errln("testFollowing(): iterator returned DONE prematurely.");
554 }
555
556 // Full check of all results.
557 td.checkResults("testFollowing", this);
558 }
559
560
561
562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
563 UErrorCode status = U_ZERO_ERROR;
564 int32_t p;
565 int32_t tag;
566 int32_t lastP = 0x7ffffffe;
567 int i;
568
569 logln("testPreceding():");
570 bi.setText(td.fDataToBreak);
571 td.clearResults();
572
573 p = bi.last();
574 td.fActualBreakPositions.addElement(p, status);
575 tag = bi.getRuleStatus();
576 td.fActualTags.addElement(tag, status);
577
578 for (i = td.fDataToBreak.length(); i>=-1; i--) {
579 p = bi.preceding(i);
580 if (p != lastP) {
581 if (p == RuleBasedBreakIterator::DONE) {
582 break;
583 }
584 // We've reached a new break position. Save it.
585 td.fActualBreakPositions.insertElementAt(p, 0, status);
586 lastP = p;
587 tag = bi.getRuleStatus();
588 td.fActualTags.insertElementAt(tag, 0, status);
589 }
590 }
591 // The loop normally exits by means of the break in the middle.
592 // Make sure that the index was at the correct position for the break iterator to have
593 // returned DONE.
594 if (i != 0) {
595 errln("testPreceding(): iterator returned DONE prematurely.");
596 }
597
598 // Full check of all results.
599 td.checkResults("testPreceding", this);
600 }
601
602
603
604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
605 UErrorCode status = U_ZERO_ERROR;
606 int i;
607 int32_t tag;
608
609 logln("testIsBoundary():");
610 bi.setText(td.fDataToBreak);
611 td.clearResults();
612
613 for (i = 0; i <= td.fDataToBreak.length(); i++) {
614 if (bi.isBoundary(i)) {
615 td.fActualBreakPositions.addElement(i, status); // Save result.
616 tag = bi.getRuleStatus();
617 td.fActualTags.addElement(tag, status);
618 }
619 }
620 td.checkResults("testIsBoundary: ", this);
621 }
622
623
624
625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
626 {
627 iterator.setText(td.fDataToBreak);
628
629 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
630 int32_t offset = iterator.first();
631 int32_t testOffset;
632 int32_t count = 0;
633
634 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
635
636 if (*testIterator != iterator)
637 errln("clone() or operator!= failed: two clones compared unequal");
638
639 do {
640 testOffset = testIterator->first();
641 testOffset = testIterator->next(count);
642 if (offset != testOffset)
643 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
644
645 if (offset != RuleBasedBreakIterator::DONE) {
646 count++;
647 offset = iterator.next();
648
649 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
650 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
651 if (count > 10000 || offset == -1) {
652 errln("operator== failed too many times. Stopping test.");
653 if (offset == -1) {
654 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
655 }
656 return;
657 }
658 }
659 }
660 } while (offset != RuleBasedBreakIterator::DONE);
661
662 // now do it backwards...
663 offset = iterator.last();
664 count = 0;
665
666 do {
667 testOffset = testIterator->last();
668 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
669 if (offset != testOffset)
670 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
671
672 if (offset != RuleBasedBreakIterator::DONE) {
673 count--;
674 offset = iterator.previous();
675 }
676 } while (offset != RuleBasedBreakIterator::DONE);
677
678 delete testIterator;
679 }
680
681
682 //---------------------------------------------
683 //
684 // other tests
685 //
686 //---------------------------------------------
687 void RBBITest::TestEmptyString()
688 {
689 UnicodeString text = "";
690 UErrorCode status = U_ZERO_ERROR;
691
692 BITestData x(status);
693 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
694 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
695 if (U_FAILURE(status))
696 {
697 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
698 return;
699 }
700 generalIteratorTest(*bi, x);
701 delete bi;
702 }
703
704 void RBBITest::TestGetAvailableLocales()
705 {
706 int32_t locCount = 0;
707 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
708
709 if (locCount == 0)
710 dataerrln("getAvailableLocales() returned an empty list!");
711 // Just make sure that it's returning good memory.
712 int32_t i;
713 for (i = 0; i < locCount; ++i) {
714 logln(locList[i].getName());
715 }
716 }
717
718 //Testing the BreakIterator::getDisplayName() function
719 void RBBITest::TestGetDisplayName()
720 {
721 UnicodeString result;
722
723 BreakIterator::getDisplayName(Locale::getUS(), result);
724 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
725 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
726 + result);
727
728 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
729 if (result != "French (France)")
730 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
731 + result);
732 }
733 /**
734 * Test End Behaviour
735 * @bug 4068137
736 */
737 void RBBITest::TestEndBehaviour()
738 {
739 UErrorCode status = U_ZERO_ERROR;
740 UnicodeString testString("boo.");
741 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
742 if (U_FAILURE(status))
743 {
744 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
745 return;
746 }
747 wb->setText(testString);
748
749 if (wb->first() != 0)
750 errln("Didn't get break at beginning of string.");
751 if (wb->next() != 3)
752 errln("Didn't get break before period in \"boo.\"");
753 if (wb->current() != 4 && wb->next() != 4)
754 errln("Didn't get break at end of string.");
755 delete wb;
756 }
757 /*
758 * @bug 4153072
759 */
760 void RBBITest::TestBug4153072() {
761 UErrorCode status = U_ZERO_ERROR;
762 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
763 if (U_FAILURE(status))
764 {
765 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
766 return;
767 }
768 UnicodeString str("...Hello, World!...");
769 int32_t begin = 3;
770 int32_t end = str.length() - 3;
771 UBool onBoundary;
772
773 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
774 iter->adoptText(textIterator);
775 int index;
776 // Note: with the switch to UText, there is no way to restrict the
777 // iteration range to begin at an index other than zero.
778 // String character iterators created with a non-zero bound are
779 // treated by RBBI as being empty.
780 for (index = -1; index < begin + 1; ++index) {
781 onBoundary = iter->isBoundary(index);
782 if (index == 0? !onBoundary : onBoundary) {
783 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
784 " and begin index = " + begin);
785 }
786 }
787 delete iter;
788 }
789
790
791 //
792 // Test for problem reported by Ashok Matoria on 9 July 2007
793 // One.<kSoftHyphen><kSpace>Two.
794 //
795 // Sentence break at start (0) and then on calling next() it breaks at
796 // 'T' of "Two". Now, at this point if I do next() and
797 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
798 //
799 void RBBITest::TestBug5775() {
800 UErrorCode status = U_ZERO_ERROR;
801 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
802 TEST_ASSERT_SUCCESS(status);
803 if (U_FAILURE(status)) {
804 return;
805 }
806 // Check for status first for better handling of no data errors.
807 TEST_ASSERT(bi != NULL);
808 if (bi == NULL) {
809 return;
810 }
811
812 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
813 // 01234 56789
814 s = s.unescape();
815 bi->setText(s);
816 int pos = bi->next();
817 TEST_ASSERT(pos == 6);
818 pos = bi->next();
819 TEST_ASSERT(pos == 10);
820 pos = bi->previous();
821 TEST_ASSERT(pos == 6);
822 delete bi;
823 }
824
825
826
827 //------------------------------------------------------------------------------
828 //
829 // RBBITest::Extended Run RBBI Tests from an external test data file
830 //
831 //------------------------------------------------------------------------------
832
833 struct TestParams {
834 BreakIterator *bi;
835 UnicodeString dataToBreak;
836 UVector32 *expectedBreaks;
837 UVector32 *srcLine;
838 UVector32 *srcCol;
839 };
840
841 void RBBITest::executeTest(TestParams *t) {
842 int32_t bp;
843 int32_t prevBP;
844 int32_t i;
845
846 if (t->bi == NULL) {
847 return;
848 }
849
850 t->bi->setText(t->dataToBreak);
851 //
852 // Run the iterator forward
853 //
854 prevBP = -1;
855 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
856 if (prevBP == bp) {
857 // Fail for lack of forward progress.
858 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
859 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
860 break;
861 }
862
863 // Check that there were we didn't miss an expected break between the last one
864 // and this one.
865 for (i=prevBP+1; i<bp; i++) {
866 if (t->expectedBreaks->elementAti(i) != 0) {
867 int expected[] = {0, i};
868 printStringBreaks(t->dataToBreak, expected, 2);
869 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
870 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
871 }
872 }
873
874 // Check that the break we did find was expected
875 if (t->expectedBreaks->elementAti(bp) == 0) {
876 int expected[] = {0, bp};
877 printStringBreaks(t->dataToBreak, expected, 2);
878 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
879 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
880 } else {
881 // The break was expected.
882 // Check that the {nnn} tag value is correct.
883 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
884 if (expectedTagVal == -1) {
885 expectedTagVal = 0;
886 }
887 int32_t line = t->srcLine->elementAti(bp);
888 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
889 if (rs != expectedTagVal) {
890 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
891 " Actual, Expected status = %4d, %4d",
892 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
893 }
894 }
895
896
897 prevBP = bp;
898 }
899
900 // Verify that there were no missed expected breaks after the last one found
901 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
902 if (t->expectedBreaks->elementAti(i) != 0) {
903 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
904 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
905 }
906 }
907
908 //
909 // Run the iterator backwards, verify that the same breaks are found.
910 //
911 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
912 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
913 if (prevBP == bp) {
914 // Fail for lack of progress.
915 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
916 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
917 break;
918 }
919
920 // Check that there were we didn't miss an expected break between the last one
921 // and this one. (UVector returns zeros for index out of bounds.)
922 for (i=prevBP-1; i>bp; i--) {
923 if (t->expectedBreaks->elementAti(i) != 0) {
924 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
925 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
926 }
927 }
928
929 // Check that the break we did find was expected
930 if (t->expectedBreaks->elementAti(bp) == 0) {
931 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
932 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
933 } else {
934 // The break was expected.
935 // Check that the {nnn} tag value is correct.
936 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
937 if (expectedTagVal == -1) {
938 expectedTagVal = 0;
939 }
940 int line = t->srcLine->elementAti(bp);
941 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
942 if (rs != expectedTagVal) {
943 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
944 " Actual, Expected status = %4d, %4d",
945 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
946 }
947 }
948
949 prevBP = bp;
950 }
951
952 // Verify that there were no missed breaks prior to the last one found
953 for (i=prevBP-1; i>=0; i--) {
954 if (t->expectedBreaks->elementAti(i) != 0) {
955 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
956 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
957 }
958 }
959
960 // Check isBoundary()
961 for (i=0; i<t->expectedBreaks->size(); i++) {
962 UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
963 UBool boundaryFound = t->bi->isBoundary(i);
964 if (boundaryExpected != boundaryFound) {
965 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
966 " Expected, Actual= %s, %s",
967 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
968 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
969 }
970 }
971
972 // Check following()
973 for (i=0; i<t->expectedBreaks->size(); i++) {
974 int32_t actualBreak = t->bi->following(i);
975 int32_t expectedBreak = BreakIterator::DONE;
976 for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
977 if (t->expectedBreaks->elementAti(j) != 0) {
978 expectedBreak = j;
979 break;
980 }
981 }
982 if (expectedBreak != actualBreak) {
983 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
984 " Expected, Actual= %d, %d",
985 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
986 }
987 }
988
989 // Check preceding()
990 for (i=t->expectedBreaks->size(); i>=0; i--) {
991 int32_t actualBreak = t->bi->preceding(i);
992 int32_t expectedBreak = BreakIterator::DONE;
993
994 for (int32_t j=i-1; j >= 0; j--) {
995 if (t->expectedBreaks->elementAti(j) != 0) {
996 expectedBreak = j;
997 break;
998 }
999 }
1000 if (expectedBreak != actualBreak) {
1001 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1002 " Expected, Actual= %d, %d",
1003 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1004 }
1005 }
1006 }
1007
1008
1009 void RBBITest::TestExtended() {
1010 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1011 UErrorCode status = U_ZERO_ERROR;
1012 Locale locale("");
1013
1014 UnicodeString rules;
1015 TestParams tp;
1016 tp.bi = NULL;
1017 tp.expectedBreaks = new UVector32(status);
1018 tp.srcLine = new UVector32(status);
1019 tp.srcCol = new UVector32(status);
1020
1021 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1022 if (U_FAILURE(status)) {
1023 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1024 }
1025
1026
1027 //
1028 // Open and read the test data file.
1029 //
1030 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1031 char testFileName[1000];
1032 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1033 errln("Can't open test data. Path too long.");
1034 return;
1035 }
1036 strcpy(testFileName, testDataDirectory);
1037 strcat(testFileName, "rbbitst.txt");
1038
1039 int len;
1040 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1041 if (U_FAILURE(status)) {
1042 return; /* something went wrong, error already output */
1043 }
1044
1045
1046
1047
1048 //
1049 // Put the test data into a UnicodeString
1050 //
1051 UnicodeString testString(FALSE, testFile, len);
1052
1053 enum EParseState{
1054 PARSE_COMMENT,
1055 PARSE_TAG,
1056 PARSE_DATA,
1057 PARSE_NUM
1058 }
1059 parseState = PARSE_TAG;
1060
1061 EParseState savedState = PARSE_TAG;
1062
1063 static const UChar CH_LF = 0x0a;
1064 static const UChar CH_CR = 0x0d;
1065 static const UChar CH_HASH = 0x23;
1066 /*static const UChar CH_PERIOD = 0x2e;*/
1067 static const UChar CH_LT = 0x3c;
1068 static const UChar CH_GT = 0x3e;
1069 static const UChar CH_BACKSLASH = 0x5c;
1070 static const UChar CH_BULLET = 0x2022;
1071
1072 int32_t lineNum = 1;
1073 int32_t colStart = 0;
1074 int32_t column = 0;
1075 int32_t charIdx = 0;
1076
1077 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1078
1079 for (charIdx = 0; charIdx < len; ) {
1080 status = U_ZERO_ERROR;
1081 UChar c = testString.charAt(charIdx);
1082 charIdx++;
1083 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1084 // treat CRLF as a unit
1085 c = CH_LF;
1086 charIdx++;
1087 }
1088 if (c == CH_LF || c == CH_CR) {
1089 lineNum++;
1090 colStart = charIdx;
1091 }
1092 column = charIdx - colStart + 1;
1093
1094 switch (parseState) {
1095 case PARSE_COMMENT:
1096 if (c == 0x0a || c == 0x0d) {
1097 parseState = savedState;
1098 }
1099 break;
1100
1101 case PARSE_TAG:
1102 {
1103 if (c == CH_HASH) {
1104 parseState = PARSE_COMMENT;
1105 savedState = PARSE_TAG;
1106 break;
1107 }
1108 if (u_isUWhiteSpace(c)) {
1109 break;
1110 }
1111 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1112 delete tp.bi;
1113 tp.bi = BreakIterator::createWordInstance(locale, status);
1114 charIdx += 5;
1115 break;
1116 }
1117 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1118 delete tp.bi;
1119 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1120 charIdx += 5;
1121 break;
1122 }
1123 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1124 delete tp.bi;
1125 tp.bi = BreakIterator::createLineInstance(locale, status);
1126 charIdx += 5;
1127 break;
1128 }
1129 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1130 delete tp.bi;
1131 tp.bi = NULL;
1132 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1133 charIdx += 5;
1134 break;
1135 }
1136 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1137 delete tp.bi;
1138 tp.bi = BreakIterator::createTitleInstance(locale, status);
1139 charIdx += 6;
1140 break;
1141 }
1142
1143 // <locale loc_name>
1144 localeMatcher.reset(testString);
1145 if (localeMatcher.lookingAt(charIdx-1, status)) {
1146 UnicodeString localeName = localeMatcher.group(1, status);
1147 char localeName8[100];
1148 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1149 locale = Locale::createFromName(localeName8);
1150 charIdx += localeMatcher.group(0, status).length() - 1;
1151 TEST_ASSERT_SUCCESS(status);
1152 break;
1153 }
1154 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1155 parseState = PARSE_DATA;
1156 charIdx += 5;
1157 tp.dataToBreak = "";
1158 tp.expectedBreaks->removeAllElements();
1159 tp.srcCol ->removeAllElements();
1160 tp.srcLine->removeAllElements();
1161 break;
1162 }
1163
1164 errln("line %d: Tag expected in test file.", lineNum);
1165 parseState = PARSE_COMMENT;
1166 savedState = PARSE_DATA;
1167 goto end_test; // Stop the test.
1168 }
1169 break;
1170
1171 case PARSE_DATA:
1172 if (c == CH_BULLET) {
1173 int32_t breakIdx = tp.dataToBreak.length();
1174 tp.expectedBreaks->setSize(breakIdx+1);
1175 tp.expectedBreaks->setElementAt(-1, breakIdx);
1176 tp.srcLine->setSize(breakIdx+1);
1177 tp.srcLine->setElementAt(lineNum, breakIdx);
1178 tp.srcCol ->setSize(breakIdx+1);
1179 tp.srcCol ->setElementAt(column, breakIdx);
1180 break;
1181 }
1182
1183 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1184 // Add final entry to mappings from break location to source file position.
1185 // Need one extra because last break position returned is after the
1186 // last char in the data, not at the last char.
1187 tp.srcLine->addElement(lineNum, status);
1188 tp.srcCol ->addElement(column, status);
1189
1190 parseState = PARSE_TAG;
1191 charIdx += 6;
1192
1193 // RUN THE TEST!
1194 executeTest(&tp);
1195 break;
1196 }
1197
1198 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1199 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1200 // Get the code point from the name and insert it into the test data.
1201 // (Damn, no API takes names in Unicode !!!
1202 // we've got to take it back to char *)
1203 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1204 int32_t nameLength = nameEndIdx - (charIdx+2);
1205 char charNameBuf[200];
1206 UChar32 theChar = -1;
1207 if (nameEndIdx != -1) {
1208 UErrorCode status = U_ZERO_ERROR;
1209 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1210 charNameBuf[sizeof(charNameBuf)-1] = 0;
1211 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1212 if (U_FAILURE(status)) {
1213 theChar = -1;
1214 }
1215 }
1216 if (theChar == -1) {
1217 errln("Error in named character in test file at line %d, col %d",
1218 lineNum, column);
1219 } else {
1220 // Named code point was recognized. Insert it
1221 // into the test data.
1222 tp.dataToBreak.append(theChar);
1223 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1224 tp.srcLine->addElement(lineNum, status);
1225 tp.srcCol ->addElement(column, status);
1226 }
1227 }
1228 if (nameEndIdx > charIdx) {
1229 charIdx = nameEndIdx+1;
1230
1231 }
1232 break;
1233 }
1234
1235
1236
1237
1238 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1239 charIdx++;
1240 int32_t breakIdx = tp.dataToBreak.length();
1241 tp.expectedBreaks->setSize(breakIdx+1);
1242 tp.expectedBreaks->setElementAt(-1, breakIdx);
1243 tp.srcLine->setSize(breakIdx+1);
1244 tp.srcLine->setElementAt(lineNum, breakIdx);
1245 tp.srcCol ->setSize(breakIdx+1);
1246 tp.srcCol ->setElementAt(column, breakIdx);
1247 break;
1248 }
1249
1250 if (c == CH_LT) {
1251 tagValue = 0;
1252 parseState = PARSE_NUM;
1253 break;
1254 }
1255
1256 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1257 parseState = PARSE_COMMENT;
1258 savedState = PARSE_DATA;
1259 break;
1260 }
1261
1262 if (c == CH_BACKSLASH) {
1263 // Check for \ at end of line, a line continuation.
1264 // Advance over (discard) the newline
1265 UChar32 cp = testString.char32At(charIdx);
1266 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1267 // We have a CR LF
1268 // Need an extra increment of the input ptr to move over both of them
1269 charIdx++;
1270 }
1271 if (cp == CH_LF || cp == CH_CR) {
1272 lineNum++;
1273 colStart = charIdx;
1274 charIdx++;
1275 break;
1276 }
1277
1278 // Let unescape handle the back slash.
1279 cp = testString.unescapeAt(charIdx);
1280 if (cp != -1) {
1281 // Escape sequence was recognized. Insert the char
1282 // into the test data.
1283 tp.dataToBreak.append(cp);
1284 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1285 tp.srcLine->addElement(lineNum, status);
1286 tp.srcCol ->addElement(column, status);
1287 }
1288 break;
1289 }
1290
1291
1292 // Not a recognized backslash escape sequence.
1293 // Take the next char as a literal.
1294 // TODO: Should this be an error?
1295 c = testString.charAt(charIdx);
1296 charIdx = testString.moveIndex32(charIdx, 1);
1297 }
1298
1299 // Normal, non-escaped data char.
1300 tp.dataToBreak.append(c);
1301
1302 // Save the mapping from offset in the data to line/column numbers in
1303 // the original input file. Will be used for better error messages only.
1304 // If there's an expected break before this char, the slot in the mapping
1305 // vector will already be set for this char; don't overwrite it.
1306 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1307 tp.srcLine->addElement(lineNum, status);
1308 tp.srcCol ->addElement(column, status);
1309 }
1310 break;
1311
1312
1313 case PARSE_NUM:
1314 // We are parsing an expected numeric tag value, like <1234>,
1315 // within a chunk of data.
1316 if (u_isUWhiteSpace(c)) {
1317 break;
1318 }
1319
1320 if (c == CH_GT) {
1321 // Finished the number. Add the info to the expected break data,
1322 // and switch parse state back to doing plain data.
1323 parseState = PARSE_DATA;
1324 if (tagValue == 0) {
1325 tagValue = -1;
1326 }
1327 int32_t breakIdx = tp.dataToBreak.length();
1328 tp.expectedBreaks->setSize(breakIdx+1);
1329 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1330 tp.srcLine->setSize(breakIdx+1);
1331 tp.srcLine->setElementAt(lineNum, breakIdx);
1332 tp.srcCol ->setSize(breakIdx+1);
1333 tp.srcCol ->setElementAt(column, breakIdx);
1334 break;
1335 }
1336
1337 if (u_isdigit(c)) {
1338 tagValue = tagValue*10 + u_charDigitValue(c);
1339 break;
1340 }
1341
1342 errln("Syntax Error in test file at line %d, col %d",
1343 lineNum, column);
1344 parseState = PARSE_COMMENT;
1345 goto end_test; // Stop the test
1346 break;
1347 }
1348
1349
1350 if (U_FAILURE(status)) {
1351 dataerrln("ICU Error %s while parsing test file at line %d.",
1352 u_errorName(status), lineNum);
1353 status = U_ZERO_ERROR;
1354 goto end_test; // Stop the test
1355 }
1356
1357 }
1358
1359 end_test:
1360 delete tp.bi;
1361 delete tp.expectedBreaks;
1362 delete tp.srcLine;
1363 delete tp.srcCol;
1364 delete [] testFile;
1365 #endif
1366 }
1367
1368
1369 //-------------------------------------------------------------------------------
1370 //
1371 // TestDictRules create a break iterator from source rules that includes a
1372 // dictionary range. Regression for bug #7130. Source rules
1373 // do not declare a break iterator type (word, line, sentence, etc.),
1374 // but the dictionary code, without a type, would loop.
1375 //
1376 //-------------------------------------------------------------------------------
1377 void RBBITest::TestDictRules() {
1378 const char *rules = "$dictionary = [a-z]; \n"
1379 "!!forward; \n"
1380 "$dictionary $dictionary; \n"
1381 "!!reverse; \n"
1382 "$dictionary $dictionary; \n";
1383 const char *text = "aa";
1384 UErrorCode status = U_ZERO_ERROR;
1385 UParseError parseError;
1386
1387 RuleBasedBreakIterator bi(rules, parseError, status);
1388 if (U_SUCCESS(status)) {
1389 UnicodeString utext = text;
1390 bi.setText(utext);
1391 int32_t position;
1392 int32_t loops;
1393 for (loops = 0; loops<10; loops++) {
1394 position = bi.next();
1395 if (position == RuleBasedBreakIterator::DONE) {
1396 break;
1397 }
1398 }
1399 TEST_ASSERT(loops == 1);
1400 } else {
1401 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1402 }
1403 }
1404
1405
1406
1407 //-------------------------------------------------------------------------------
1408 //
1409 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1410 // return the data in one big UChar * buffer, which the caller must delete.
1411 //
1412 // parameters:
1413 // fileName: the path of the file to open. It is passed to fopen() unchanged,
1414 // so callers include the test data directory in the path.
1415 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1416 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1417 // specified here. The BOM, if it exists, will be stripped from the returned data.
1418 // Pass NULL for the system default encoding.
1419 // status
1420 // returns:
1421 // The file data, converted to UChar.
1422 // The caller must delete this when done with
1423 // delete [] theBuffer;
1424 //
1425 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1426 // Move this function to some common place.
1427 //
1428 //--------------------------------------------------------------------------------
1429 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1430 UChar *retPtr = NULL;
1431 char *fileBuf = NULL;
1432 UConverter* conv = NULL;
1433 FILE *f = NULL;
1434
1435 ulen = 0;
1436 if (U_FAILURE(status)) {
1437 return retPtr;
1438 }
1439
1440 //
1441 // Open the file.
1442 //
1443 f = fopen(fileName, "rb");
1444 if (f == 0) {
1445 dataerrln("Error opening test data file %s\n", fileName);
1446 status = U_FILE_ACCESS_ERROR;
1447 return NULL;
1448 }
1449 //
1450 // Read it in
1451 //
1452 int fileSize;
1453 int amt_read;
1454
1455 fseek( f, 0, SEEK_END);
1456 fileSize = ftell(f);
1457 fileBuf = new char[fileSize];
1458 fseek(f, 0, SEEK_SET);
1459 amt_read = fread(fileBuf, 1, fileSize, f);
1460 if (amt_read != fileSize || fileSize <= 0) {
1461 errln("Error reading test data file.");
1462 goto cleanUpAndReturn;
1463 }
1464
1465 //
1466 // Look for a Unicode Signature (BOM) on the data just read
1467 //
1468 int32_t signatureLength;
1469 const char * fileBufC;
1470 const char* bomEncoding;
1471
1472 fileBufC = fileBuf;
1473 bomEncoding = ucnv_detectUnicodeSignature(
1474 fileBuf, fileSize, &signatureLength, &status);
1475 if(bomEncoding!=NULL ){
1476 fileBufC += signatureLength;
1477 fileSize -= signatureLength;
1478 encoding = bomEncoding;
1479 }
1480
1481 //
1482 // Open a converter to take the rule file to UTF-16
1483 //
1484 conv = ucnv_open(encoding, &status);
1485 if (U_FAILURE(status)) {
1486 goto cleanUpAndReturn;
1487 }
1488
1489 //
1490 // Convert the rules to UChar.
1491 // Preflight first to determine required buffer size.
1492 //
1493 ulen = ucnv_toUChars(conv,
1494 NULL, // dest,
1495 0, // destCapacity,
1496 fileBufC,
1497 fileSize,
1498 &status);
1499 if (status == U_BUFFER_OVERFLOW_ERROR) {
1500 // Buffer Overflow is expected from the preflight operation.
1501 status = U_ZERO_ERROR;
1502
1503 retPtr = new UChar[ulen+1];
1504 ucnv_toUChars(conv,
1505 retPtr, // dest,
1506 ulen+1,
1507 fileBufC,
1508 fileSize,
1509 &status);
1510 }
1511
1512 cleanUpAndReturn:
1513 fclose(f);
1514 delete []fileBuf;
1515 ucnv_close(conv);
1516 if (U_FAILURE(status)) {
1517 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1518 delete []retPtr;
1519 retPtr = 0;
1520 ulen = 0;
1521 };
1522 return retPtr;
1523 }
1524
1525
1526
1527 //--------------------------------------------------------------------------------------------
1528 //
1529 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1530 //
1531 //-------------------------------------------------------------------------------------------
1532 void RBBITest::TestUnicodeFiles() {
1533 RuleBasedBreakIterator *bi;
1534 UErrorCode status = U_ZERO_ERROR;
1535
1536 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1537 TEST_ASSERT_SUCCESS(status);
1538 if (U_SUCCESS(status)) {
1539 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1540 }
1541 delete bi;
1542
1543 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1544 TEST_ASSERT_SUCCESS(status);
1545 if (U_SUCCESS(status)) {
1546 runUnicodeTestData("WordBreakTest.txt", bi);
1547 }
1548 delete bi;
1549
1550 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1551 TEST_ASSERT_SUCCESS(status);
1552 if (U_SUCCESS(status)) {
1553 runUnicodeTestData("SentenceBreakTest.txt", bi);
1554 }
1555 delete bi;
1556
1557 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1558 TEST_ASSERT_SUCCESS(status);
1559 if (U_SUCCESS(status)) {
1560 runUnicodeTestData("LineBreakTest.txt", bi);
1561 }
1562 delete bi;
1563 }
1564
1565
1566 //--------------------------------------------------------------------------------------------
1567 //
1568 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1569 //
1570 //-------------------------------------------------------------------------------------------
1571 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1573 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1574 UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
1575 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1576 UErrorCode status = U_ZERO_ERROR;
1577
1578 //
1579 // Open and read the test data file, put it into a UnicodeString.
1580 //
1581 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1582 char testFileName[1000];
1583 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1584 dataerrln("Can't open test data. Path too long.");
1585 return;
1586 }
1587 strcpy(testFileName, testDataDirectory);
1588 strcat(testFileName, fileName);
1589
1590 logln("Opening data file %s\n", fileName);
1591
1592 int len;
1593 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1594 if (status != U_FILE_ACCESS_ERROR) {
1595 TEST_ASSERT_SUCCESS(status);
1596 TEST_ASSERT(testFile != NULL);
1597 }
1598 if (U_FAILURE(status) || testFile == NULL) {
1599 return; /* something went wrong, error already output */
1600 }
1601 UnicodeString testFileAsString(TRUE, testFile, len);
1602
1603 //
1604 // Parse the test data file using a regular expression.
1605 // Each kind of token is recognized in its own capture group; what type of item was scanned
1606 // is identified by which group had a match.
1607 //
1608 // Caputure Group # 1 2 3 4 5
1609 // Parses this item: divide x hex digits comment \n unrecognized \n
1610 //
1611 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1612 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1613 UnicodeString testString;
1614 UVector32 breakPositions(status);
1615 int lineNumber = 1;
1616 TEST_ASSERT_SUCCESS(status);
1617 if (U_FAILURE(status)) {
1618 return;
1619 }
1620
1621 //
1622 // Scan through each test case, building up the string to be broken in testString,
1623 // and the positions that should be boundaries in the breakPositions vector.
1624 //
1625 int spin = 0;
1626 while (tokenMatcher.find()) {
1627 if(tokenMatcher.hitEnd()) {
1628 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1629 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1630 and caused an infinite loop here on EBCDIC systems!
1631 */
1632 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1633 // return;
1634 }
1635 if (tokenMatcher.start(1, status) >= 0) {
1636 // Scanned a divide sign, indicating a break position in the test data.
1637 if (testString.length()>0) {
1638 breakPositions.addElement(testString.length(), status);
1639 }
1640 }
1641 else if (tokenMatcher.start(2, status) >= 0) {
1642 // Scanned an 'x', meaning no break at this position in the test data
1643 // Nothing to be done here.
1644 }
1645 else if (tokenMatcher.start(3, status) >= 0) {
1646 // Scanned Hex digits. Convert them to binary, append to the character data string.
1647 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1648 int length = hexNumber.length();
1649 if (length<=8) {
1650 char buf[10];
1651 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1652 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1653 if (c<=0x10ffff) {
1654 testString.append(c);
1655 } else {
1656 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1657 fileName, lineNumber);
1658 }
1659 } else {
1660 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1661 fileName, lineNumber);
1662 }
1663 }
1664 else if (tokenMatcher.start(4, status) >= 0) {
1665 // Scanned to end of a line, possibly skipping over a comment in the process.
1666 // If the line from the file contained test data, run the test now.
1667 //
1668 if (testString.length() > 0) {
1669 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1670 // Rule 8
1671 // ZW SP* <break>
1672 // is not yet implemented.
1673 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1674 5202 == lineNumber ||
1675 5214 == lineNumber ||
1676 5246 == lineNumber ||
1677 5298 == lineNumber ||
1678 5302 == lineNumber ))) {
1679 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1680 }
1681 }
1682
1683 // Clear out this test case.
1684 // The string and breakPositions vector will be refilled as the next
1685 // test case is parsed.
1686 testString.remove();
1687 breakPositions.removeAllElements();
1688 lineNumber++;
1689 } else {
1690 // Scanner catchall. Something unrecognized appeared on the line.
1691 char token[16];
1692 UnicodeString uToken = tokenMatcher.group(0, status);
1693 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1694 token[sizeof(token)-1] = 0;
1695 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1696
1697 // Clean up, in preparation for continuing with the next line.
1698 testString.remove();
1699 breakPositions.removeAllElements();
1700 lineNumber++;
1701 }
1702 TEST_ASSERT_SUCCESS(status);
1703 if (U_FAILURE(status)) {
1704 break;
1705 }
1706 }
1707
1708 delete [] testFile;
1709 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1710 }
1711
1712 //--------------------------------------------------------------------------------------------
1713 //
1714 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1715 // test data files. Do only a simple, forward-only check -
1716 // this test is mostly to check that ICU and the Unicode
1717 // data agree with each other.
1718 //
1719 //--------------------------------------------------------------------------------------------
1720 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1721 const UnicodeString &testString, // Text data to be broken
1722 UVector32 *breakPositions, // Positions where breaks should be found.
1723 RuleBasedBreakIterator *bi) {
1724 int32_t pos; // Break Position in the test string
1725 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1726 int32_t expectedPos; // Expected break position (index into test string)
1727
1728 bi->setText(testString);
1729 pos = bi->first();
1730 pos = bi->next();
1731
1732 while (pos != BreakIterator::DONE) {
1733 if (expectedI >= breakPositions->size()) {
1734 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1735 testFileName, lineNumber, pos);
1736 break;
1737 }
1738 expectedPos = breakPositions->elementAti(expectedI);
1739 if (pos < expectedPos) {
1740 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1741 testFileName, lineNumber, pos);
1742 break;
1743 }
1744 if (pos > expectedPos) {
1745 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1746 testFileName, lineNumber, expectedPos);
1747 break;
1748 }
1749 pos = bi->next();
1750 expectedI++;
1751 }
1752
1753 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1754 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1755 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1756 }
1757 }
1758
1759
1760
1761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1762 //---------------------------------------------------------------------------------------
1763 //
1764 // class RBBIMonkeyKind
1765 //
1766 // Monkey Test for Break Iteration
1767 // Abstract interface class. Concrete derived classes independently
1768 // implement the break rules for different iterator types.
1769 //
1770 // The Monkey Test itself doesn't know which type of break iterator it is
1771 // testing, but works purely in terms of the interface defined here.
1772 //
1773 //---------------------------------------------------------------------------------------
1774 class RBBIMonkeyKind {
1775 public:
1776 // Return a UVector of UnicodeSets, representing the character classes used
1777 // for this type of iterator.
1778 virtual UVector *charClasses() = 0;
1779
1780 // Set the test text on which subsequent calls to next() will operate
1781 virtual void setText(const UnicodeString &s) = 0;
1782
1783 // Find the next break postion, starting from the prev break position, or from zero.
1784 // Return -1 after reaching end of string.
1785 virtual int32_t next(int32_t i) = 0;
1786
1787 virtual ~RBBIMonkeyKind();
1788 UErrorCode deferredStatus;
1789
1790
1791 protected:
1792 RBBIMonkeyKind();
1793
1794 private:
1795 };
1796
1797 RBBIMonkeyKind::RBBIMonkeyKind() {
1798 deferredStatus = U_ZERO_ERROR;
1799 }
1800
1801 RBBIMonkeyKind::~RBBIMonkeyKind() {
1802 }
1803
1804
1805 //----------------------------------------------------------------------------------------
1806 //
1807 // Random Numbers. Similar to standard lib rand() and srand()
1808 // Not using library to
1809 // 1. Get same results on all platforms.
1810 // 2. Get access to current seed, to more easily reproduce failures.
1811 //
1812 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to restart a sequence.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a
//   15 bit result, in the range 0 .. 32767, taken from bits 16..30 of the state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;    // wraps modulo 2^32
    return (m_seed >> 16) & 0x7fff;
}
1820
1821
1822 //------------------------------------------------------------------------------------------
1823 //
1824 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1825 // of RBBIMonkeyKind.
1826 //
1827 //------------------------------------------------------------------------------------------
1828 class RBBICharMonkey: public RBBIMonkeyKind {
1829 public:
1830 RBBICharMonkey();
1831 virtual ~RBBICharMonkey();
1832 virtual UVector *charClasses();
1833 virtual void setText(const UnicodeString &s);
1834 virtual int32_t next(int32_t i);
1835 private:
1836 UVector *fSets;
1837
1838 UnicodeSet *fCRLFSet;
1839 UnicodeSet *fControlSet;
1840 UnicodeSet *fExtendSet;
1841 UnicodeSet *fRegionalIndicatorSet;
1842 UnicodeSet *fPrependSet;
1843 UnicodeSet *fSpacingSet;
1844 UnicodeSet *fLSet;
1845 UnicodeSet *fVSet;
1846 UnicodeSet *fTSet;
1847 UnicodeSet *fLVSet;
1848 UnicodeSet *fLVTSet;
1849 UnicodeSet *fHangulSet;
1850 UnicodeSet *fAnySet;
1851
1852 const UnicodeString *fText;
1853 };
1854
1855
1856 RBBICharMonkey::RBBICharMonkey() {
1857 UErrorCode status = U_ZERO_ERROR;
1858
1859 fText = NULL;
1860
1861 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1862 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1863 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1864 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1865 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1866 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1867 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1868 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1869 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1870 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1871 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1872 fHangulSet = new UnicodeSet();
1873 fHangulSet->addAll(*fLSet);
1874 fHangulSet->addAll(*fVSet);
1875 fHangulSet->addAll(*fTSet);
1876 fHangulSet->addAll(*fLVSet);
1877 fHangulSet->addAll(*fLVTSet);
1878 fAnySet = new UnicodeSet(0, 0x10ffff);
1879
1880 fSets = new UVector(status);
1881 fSets->addElement(fCRLFSet, status);
1882 fSets->addElement(fControlSet, status);
1883 fSets->addElement(fExtendSet, status);
1884 fSets->addElement(fRegionalIndicatorSet, status);
1885 if (!fPrependSet->isEmpty()) {
1886 fSets->addElement(fPrependSet, status);
1887 }
1888 fSets->addElement(fSpacingSet, status);
1889 fSets->addElement(fHangulSet, status);
1890 fSets->addElement(fAnySet, status);
1891 if (U_FAILURE(status)) {
1892 deferredStatus = status;
1893 }
1894 }
1895
1896
1897 void RBBICharMonkey::setText(const UnicodeString &s) {
1898 fText = &s;
1899 }
1900
1901
1902
1903 int32_t RBBICharMonkey::next(int32_t prevPos) {
1904 int p0, p1, p2, p3; // Indices of the significant code points around the
1905 // break position being tested. The candidate break
1906 // location is before p2.
1907
1908 int breakPos = -1;
1909
1910 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1911
1912 if (U_FAILURE(deferredStatus)) {
1913 return -1;
1914 }
1915
1916 // Previous break at end of string. return DONE.
1917 if (prevPos >= fText->length()) {
1918 return -1;
1919 }
1920 p0 = p1 = p2 = p3 = prevPos;
1921 c3 = fText->char32At(prevPos);
1922 c0 = c1 = c2 = 0;
1923
1924 // Loop runs once per "significant" character position in the input text.
1925 for (;;) {
1926 // Move all of the positions forward in the input string.
1927 p0 = p1; c0 = c1;
1928 p1 = p2; c1 = c2;
1929 p2 = p3; c2 = c3;
1930
1931 // Advancd p3 by one codepoint
1932 p3 = fText->moveIndex32(p3, 1);
1933 c3 = fText->char32At(p3);
1934
1935 if (p1 == p2) {
1936 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1937 continue;
1938 }
1939 if (p2 == fText->length()) {
1940 // Reached end of string. Always a break position.
1941 break;
1942 }
1943
1944 // Rule GB3 CR x LF
1945 // No Extend or Format characters may appear between the CR and LF,
1946 // which requires the additional check for p2 immediately following p1.
1947 //
1948 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1949 continue;
1950 }
1951
1952 // Rule (GB4). ( Control | CR | LF ) <break>
1953 if (fControlSet->contains(c1) ||
1954 c1 == 0x0D ||
1955 c1 == 0x0A) {
1956 break;
1957 }
1958
1959 // Rule (GB5) <break> ( Control | CR | LF )
1960 //
1961 if (fControlSet->contains(c2) ||
1962 c2 == 0x0D ||
1963 c2 == 0x0A) {
1964 break;
1965 }
1966
1967
1968 // Rule (GB6) L x ( L | V | LV | LVT )
1969 if (fLSet->contains(c1) &&
1970 (fLSet->contains(c2) ||
1971 fVSet->contains(c2) ||
1972 fLVSet->contains(c2) ||
1973 fLVTSet->contains(c2))) {
1974 continue;
1975 }
1976
1977 // Rule (GB7) ( LV | V ) x ( V | T )
1978 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1979 (fVSet->contains(c2) || fTSet->contains(c2))) {
1980 continue;
1981 }
1982
1983 // Rule (GB8) ( LVT | T) x T
1984 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1985 fTSet->contains(c2)) {
1986 continue;
1987 }
1988
1989 // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
1990
1991 // Rule (GB8a) Regional_Indicator x Regional_Indicator
1992 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1993 continue;
1994 }
1995
1996 // Rule (GB9) Numeric x ALetter
1997 if (fExtendSet->contains(c2)) {
1998 continue;
1999 }
2000
2001 // Rule (GB9a) x SpacingMark
2002 if (fSpacingSet->contains(c2)) {
2003 continue;
2004 }
2005
2006 // Rule (GB9b) Prepend x
2007 if (fPrependSet->contains(c1)) {
2008 continue;
2009 }
2010
2011 // Rule (GB10) Any <break> Any
2012 break;
2013 }
2014
2015 breakPos = p2;
2016 return breakPos;
2017 }
2018
2019
2020
2021 UVector *RBBICharMonkey::charClasses() {
2022 return fSets;
2023 }
2024
2025
2026 RBBICharMonkey::~RBBICharMonkey() {
2027 delete fSets;
2028 delete fCRLFSet;
2029 delete fControlSet;
2030 delete fExtendSet;
2031 delete fRegionalIndicatorSet;
2032 delete fPrependSet;
2033 delete fSpacingSet;
2034 delete fLSet;
2035 delete fVSet;
2036 delete fTSet;
2037 delete fLVSet;
2038 delete fLVTSet;
2039 delete fHangulSet;
2040 delete fAnySet;
2041 }
2042
2043 //------------------------------------------------------------------------------------------
2044 //
2045 // class RBBIWordMonkey Word Break specific implementation
2046 // of RBBIMonkeyKind.
2047 //
2048 //------------------------------------------------------------------------------------------
2049 class RBBIWordMonkey: public RBBIMonkeyKind {
2050 public:
2051 RBBIWordMonkey();
2052 virtual ~RBBIWordMonkey();
2053 virtual UVector *charClasses();
2054 virtual void setText(const UnicodeString &s);
2055 virtual int32_t next(int32_t i);
2056 private:
2057 UVector *fSets;
2058
2059 UnicodeSet *fCRSet;
2060 UnicodeSet *fLFSet;
2061 UnicodeSet *fNewlineSet;
2062 UnicodeSet *fKatakanaSet;
2063 UnicodeSet *fALetterSet;
2064 // TODO(jungshik): Do we still need this change?
2065 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2066 UnicodeSet *fMidNumLetSet;
2067 UnicodeSet *fMidLetterSet;
2068 UnicodeSet *fMidNumSet;
2069 UnicodeSet *fNumericSet;
2070 UnicodeSet *fFormatSet;
2071 UnicodeSet *fOtherSet;
2072 UnicodeSet *fExtendSet;
2073 UnicodeSet *fExtendNumLetSet;
2074 UnicodeSet *fRegionalIndicatorSet;
2075 UnicodeSet *fDictionaryCjkSet;
2076
2077 RegexMatcher *fMatcher;
2078
2079 const UnicodeString *fText;
2080 };
2081
2082
2083 RBBIWordMonkey::RBBIWordMonkey()
2084 {
2085 UErrorCode status = U_ZERO_ERROR;
2086
2087 fSets = new UVector(status);
2088
2089 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2090 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2091 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2092 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2093 // Exclude Hangul syllables from ALetterSet during testing.
2094 // Leave CJK dictionary characters out from the monkey tests!
2095 #if 0
2096 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2097 "[\\p{Line_Break = Complex_Context}"
2098 "-\\p{Grapheme_Cluster_Break = Extend}"
2099 "-\\p{Grapheme_Cluster_Break = Control}"
2100 "]]",
2101 status);
2102 #endif
2103 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2104 fALetterSet->removeAll(*fDictionaryCjkSet);
2105 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2106 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2107 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2108 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2109 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2110 // we should figure out why
2111 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2112 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2113 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2114 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2115 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2116
2117 fOtherSet = new UnicodeSet();
2118 if(U_FAILURE(status)) {
2119 deferredStatus = status;
2120 return;
2121 }
2122
2123 fOtherSet->complement();
2124 fOtherSet->removeAll(*fCRSet);
2125 fOtherSet->removeAll(*fLFSet);
2126 fOtherSet->removeAll(*fNewlineSet);
2127 fOtherSet->removeAll(*fKatakanaSet);
2128 fOtherSet->removeAll(*fALetterSet);
2129 fOtherSet->removeAll(*fMidLetterSet);
2130 fOtherSet->removeAll(*fMidNumSet);
2131 fOtherSet->removeAll(*fNumericSet);
2132 fOtherSet->removeAll(*fExtendNumLetSet);
2133 fOtherSet->removeAll(*fFormatSet);
2134 fOtherSet->removeAll(*fExtendSet);
2135 fOtherSet->removeAll(*fRegionalIndicatorSet);
2136 // Inhibit dictionary characters from being tested at all.
2137 fOtherSet->removeAll(*fDictionaryCjkSet);
2138 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2139
2140 fSets->addElement(fCRSet, status);
2141 fSets->addElement(fLFSet, status);
2142 fSets->addElement(fNewlineSet, status);
2143 fSets->addElement(fALetterSet, status);
2144 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2145 fSets->addElement(fMidLetterSet, status);
2146 fSets->addElement(fMidNumLetSet, status);
2147 fSets->addElement(fMidNumSet, status);
2148 fSets->addElement(fNumericSet, status);
2149 fSets->addElement(fFormatSet, status);
2150 fSets->addElement(fExtendSet, status);
2151 fSets->addElement(fOtherSet, status);
2152 fSets->addElement(fExtendNumLetSet, status);
2153 fSets->addElement(fRegionalIndicatorSet, status);
2154
2155 if (U_FAILURE(status)) {
2156 deferredStatus = status;
2157 }
2158 }
2159
2160 void RBBIWordMonkey::setText(const UnicodeString &s) {
2161 fText = &s;
2162 }
2163
2164
2165 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2166 int p0, p1, p2, p3; // Indices of the significant code points around the
2167 // break position being tested. The candidate break
2168 // location is before p2.
2169
2170 int breakPos = -1;
2171
2172 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2173
2174 if (U_FAILURE(deferredStatus)) {
2175 return -1;
2176 }
2177
2178 // Prev break at end of string. return DONE.
2179 if (prevPos >= fText->length()) {
2180 return -1;
2181 }
2182 p0 = p1 = p2 = p3 = prevPos;
2183 c3 = fText->char32At(prevPos);
2184 c0 = c1 = c2 = 0;
2185
2186 // Loop runs once per "significant" character position in the input text.
2187 for (;;) {
2188 // Move all of the positions forward in the input string.
2189 p0 = p1; c0 = c1;
2190 p1 = p2; c1 = c2;
2191 p2 = p3; c2 = c3;
2192
2193 // Advancd p3 by X(Extend | Format)* Rule 4
2194 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2195 do {
2196 p3 = fText->moveIndex32(p3, 1);
2197 c3 = fText->char32At(p3);
2198 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2199 break;
2200 };
2201 }
2202 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2203
2204
2205 if (p1 == p2) {
2206 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2207 continue;
2208 }
2209 if (p2 == fText->length()) {
2210 // Reached end of string. Always a break position.
2211 break;
2212 }
2213
2214 // Rule (3) CR x LF
2215 // No Extend or Format characters may appear between the CR and LF,
2216 // which requires the additional check for p2 immediately following p1.
2217 //
2218 if (c1==0x0D && c2==0x0A) {
2219 continue;
2220 }
2221
2222 // Rule (3a) Break before and after newlines (including CR and LF)
2223 //
2224 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2225 break;
2226 };
2227 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2228 break;
2229 };
2230
2231 // Rule (5). ALetter x ALetter
2232 if (fALetterSet->contains(c1) &&
2233 fALetterSet->contains(c2)) {
2234 continue;
2235 }
2236
2237 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2238 //
2239 if ( fALetterSet->contains(c1) &&
2240 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2241 fALetterSet->contains(c3)) {
2242 continue;
2243 }
2244
2245
2246 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2247 if (fALetterSet->contains(c0) &&
2248 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
2249 fALetterSet->contains(c2)) {
2250 continue;
2251 }
2252
2253 // Rule (8) Numeric x Numeric
2254 if (fNumericSet->contains(c1) &&
2255 fNumericSet->contains(c2)) {
2256 continue;
2257 }
2258
2259 // Rule (9) ALetter x Numeric
2260 if (fALetterSet->contains(c1) &&
2261 fNumericSet->contains(c2)) {
2262 continue;
2263 }
2264
2265 // Rule (10) Numeric x ALetter
2266 if (fNumericSet->contains(c1) &&
2267 fALetterSet->contains(c2)) {
2268 continue;
2269 }
2270
2271 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2272 if (fNumericSet->contains(c0) &&
2273 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
2274 fNumericSet->contains(c2)) {
2275 continue;
2276 }
2277
2278 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2279 if (fNumericSet->contains(c1) &&
2280 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2281 fNumericSet->contains(c3)) {
2282 continue;
2283 }
2284
2285 // Rule (13) Katakana x Katakana
2286 if (fKatakanaSet->contains(c1) &&
2287 fKatakanaSet->contains(c2)) {
2288 continue;
2289 }
2290
2291 // Rule 13a
2292 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2293 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2294 fExtendNumLetSet->contains(c2)) {
2295 continue;
2296 }
2297
2298 // Rule 13b
2299 if (fExtendNumLetSet->contains(c1) &&
2300 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2301 fKatakanaSet->contains(c2))) {
2302 continue;
2303 }
2304
2305 // Rule 13c
2306 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2307 continue;
2308 }
2309
2310 // Rule 14. Break found here.
2311 break;
2312 }
2313
2314 breakPos = p2;
2315 return breakPos;
2316 }
2317
2318
2319 UVector *RBBIWordMonkey::charClasses() {
2320 return fSets;
2321 }
2322
2323
2324 RBBIWordMonkey::~RBBIWordMonkey() {
2325 delete fSets;
2326 delete fCRSet;
2327 delete fLFSet;
2328 delete fNewlineSet;
2329 delete fKatakanaSet;
2330 delete fALetterSet;
2331 delete fMidNumLetSet;
2332 delete fMidLetterSet;
2333 delete fMidNumSet;
2334 delete fNumericSet;
2335 delete fFormatSet;
2336 delete fExtendSet;
2337 delete fExtendNumLetSet;
2338 delete fRegionalIndicatorSet;
2339 delete fDictionaryCjkSet;
2340 delete fOtherSet;
2341 }
2342
2343
2344
2345
2346 //------------------------------------------------------------------------------------------
2347 //
2348 // class RBBISentMonkey Sentence Break specific implementation
2349 // of RBBIMonkeyKind.
2350 //
2351 //------------------------------------------------------------------------------------------
2352 class RBBISentMonkey: public RBBIMonkeyKind {
2353 public:
2354 RBBISentMonkey();
2355 virtual ~RBBISentMonkey();
2356 virtual UVector *charClasses();
2357 virtual void setText(const UnicodeString &s);
2358 virtual int32_t next(int32_t i);
2359 private:
2360 int moveBack(int posFrom);
2361 int moveForward(int posFrom);
2362 UChar32 cAt(int pos);
2363
2364 UVector *fSets;
2365
2366 UnicodeSet *fSepSet;
2367 UnicodeSet *fFormatSet;
2368 UnicodeSet *fSpSet;
2369 UnicodeSet *fLowerSet;
2370 UnicodeSet *fUpperSet;
2371 UnicodeSet *fOLetterSet;
2372 UnicodeSet *fNumericSet;
2373 UnicodeSet *fATermSet;
2374 UnicodeSet *fSContinueSet;
2375 UnicodeSet *fSTermSet;
2376 UnicodeSet *fCloseSet;
2377 UnicodeSet *fOtherSet;
2378 UnicodeSet *fExtendSet;
2379
2380 const UnicodeString *fText;
2381
2382 };
2383
2384 RBBISentMonkey::RBBISentMonkey()
2385 {
2386 UErrorCode status = U_ZERO_ERROR;
2387
2388 fSets = new UVector(status);
2389
2390 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2391 // set and made into character classes of their own. For the monkey impl,
2392 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2393 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2394 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2395 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2396 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2397 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2398 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2399 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2400 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2401 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2402 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2403 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2404 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2405 fOtherSet = new UnicodeSet();
2406
2407 if(U_FAILURE(status)) {
2408 deferredStatus = status;
2409 return;
2410 }
2411
2412 fOtherSet->complement();
2413 fOtherSet->removeAll(*fSepSet);
2414 fOtherSet->removeAll(*fFormatSet);
2415 fOtherSet->removeAll(*fSpSet);
2416 fOtherSet->removeAll(*fLowerSet);
2417 fOtherSet->removeAll(*fUpperSet);
2418 fOtherSet->removeAll(*fOLetterSet);
2419 fOtherSet->removeAll(*fNumericSet);
2420 fOtherSet->removeAll(*fATermSet);
2421 fOtherSet->removeAll(*fSContinueSet);
2422 fOtherSet->removeAll(*fSTermSet);
2423 fOtherSet->removeAll(*fCloseSet);
2424 fOtherSet->removeAll(*fExtendSet);
2425
2426 fSets->addElement(fSepSet, status);
2427 fSets->addElement(fFormatSet, status);
2428 fSets->addElement(fSpSet, status);
2429 fSets->addElement(fLowerSet, status);
2430 fSets->addElement(fUpperSet, status);
2431 fSets->addElement(fOLetterSet, status);
2432 fSets->addElement(fNumericSet, status);
2433 fSets->addElement(fATermSet, status);
2434 fSets->addElement(fSContinueSet, status);
2435 fSets->addElement(fSTermSet, status);
2436 fSets->addElement(fCloseSet, status);
2437 fSets->addElement(fOtherSet, status);
2438 fSets->addElement(fExtendSet, status);
2439
2440 if (U_FAILURE(status)) {
2441 deferredStatus = status;
2442 }
2443 }
2444
2445
2446
2447 void RBBISentMonkey::setText(const UnicodeString &s) {
2448 fText = &s;
2449 }
2450
2451 UVector *RBBISentMonkey::charClasses() {
2452 return fSets;
2453 }
2454
2455
2456 // moveBack() Find the "significant" code point preceding the index i.
2457 // Skips over ($Extend | $Format)* .
2458 //
2459 int RBBISentMonkey::moveBack(int i) {
2460 if (i <= 0) {
2461 return -1;
2462 }
2463 UChar32 c;
2464 int32_t j = i;
2465 do {
2466 j = fText->moveIndex32(j, -1);
2467 c = fText->char32At(j);
2468 }
2469 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2470 return j;
2471
2472 }
2473
2474
2475 int RBBISentMonkey::moveForward(int i) {
2476 if (i>=fText->length()) {
2477 return fText->length();
2478 }
2479 UChar32 c;
2480 int32_t j = i;
2481 do {
2482 j = fText->moveIndex32(j, 1);
2483 c = cAt(j);
2484 }
2485 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2486 return j;
2487 }
2488
2489 UChar32 RBBISentMonkey::cAt(int pos) {
2490 if (pos<0 || pos>=fText->length()) {
2491 return -1;
2492 } else {
2493 return fText->char32At(pos);
2494 }
2495 }
2496
2497 int32_t RBBISentMonkey::next(int32_t prevPos) {
2498 int p0, p1, p2, p3; // Indices of the significant code points around the
2499 // break position being tested. The candidate break
2500 // location is before p2.
2501
2502 int breakPos = -1;
2503
2504 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2505 UChar32 c;
2506
2507 if (U_FAILURE(deferredStatus)) {
2508 return -1;
2509 }
2510
2511 // Prev break at end of string. return DONE.
2512 if (prevPos >= fText->length()) {
2513 return -1;
2514 }
2515 p0 = p1 = p2 = p3 = prevPos;
2516 c3 = fText->char32At(prevPos);
2517 c0 = c1 = c2 = 0;
2518
2519 // Loop runs once per "significant" character position in the input text.
2520 for (;;) {
2521 // Move all of the positions forward in the input string.
2522 p0 = p1; c0 = c1;
2523 p1 = p2; c1 = c2;
2524 p2 = p3; c2 = c3;
2525
2526 // Advancd p3 by X(Extend | Format)* Rule 4
2527 p3 = moveForward(p3);
2528 c3 = cAt(p3);
2529
2530 // Rule (3) CR x LF
2531 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2532 continue;
2533 }
2534
2535 // Rule (4). Sep <break>
2536 if (fSepSet->contains(c1)) {
2537 p2 = p1+1; // Separators don't combine with Extend or Format.
2538 break;
2539 }
2540
2541 if (p2 >= fText->length()) {
2542 // Reached end of string. Always a break position.
2543 break;
2544 }
2545
2546 if (p2 == prevPos) {
2547 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2548 continue;
2549 }
2550
2551 // Rule (6). ATerm x Numeric
2552 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2553 continue;
2554 }
2555
2556 // Rule (7). Upper ATerm x Uppper
2557 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2558 continue;
2559 }
2560
2561 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2562 // Note: STerm | ATerm are added to the negated part of the expression by a
2563 // note to the Unicode 5.0 documents.
2564 int p8 = p1;
2565 while (fSpSet->contains(cAt(p8))) {
2566 p8 = moveBack(p8);
2567 }
2568 while (fCloseSet->contains(cAt(p8))) {
2569 p8 = moveBack(p8);
2570 }
2571 if (fATermSet->contains(cAt(p8))) {
2572 p8=p2;
2573 for (;;) {
2574 c = cAt(p8);
2575 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2576 fLowerSet->contains(c) || fSepSet->contains(c) ||
2577 fATermSet->contains(c) || fSTermSet->contains(c)) {
2578 break;
2579 }
2580 p8 = moveForward(p8);
2581 }
2582 if (fLowerSet->contains(cAt(p8))) {
2583 continue;
2584 }
2585 }
2586
2587 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2588 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2589 p8 = p1;
2590 while (fSpSet->contains(cAt(p8))) {
2591 p8 = moveBack(p8);
2592 }
2593 while (fCloseSet->contains(cAt(p8))) {
2594 p8 = moveBack(p8);
2595 }
2596 c = cAt(p8);
2597 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2598 continue;
2599 }
2600 }
2601
2602 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2603 int p9 = p1;
2604 while (fCloseSet->contains(cAt(p9))) {
2605 p9 = moveBack(p9);
2606 }
2607 c = cAt(p9);
2608 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2609 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2610 continue;
2611 }
2612 }
2613
2614 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2615 int p10 = p1;
2616 while (fSpSet->contains(cAt(p10))) {
2617 p10 = moveBack(p10);
2618 }
2619 while (fCloseSet->contains(cAt(p10))) {
2620 p10 = moveBack(p10);
2621 }
2622 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2623 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2624 continue;
2625 }
2626 }
2627
2628 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2629 int p11 = p1;
2630 if (fSepSet->contains(cAt(p11))) {
2631 p11 = moveBack(p11);
2632 }
2633 while (fSpSet->contains(cAt(p11))) {
2634 p11 = moveBack(p11);
2635 }
2636 while (fCloseSet->contains(cAt(p11))) {
2637 p11 = moveBack(p11);
2638 }
2639 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2640 break;
2641 }
2642
2643 // Rule (12) Any x Any
2644 continue;
2645 }
2646 breakPos = p2;
2647 return breakPos;
2648 }
2649
2650 RBBISentMonkey::~RBBISentMonkey() {
2651 delete fSets;
2652 delete fSepSet;
2653 delete fFormatSet;
2654 delete fSpSet;
2655 delete fLowerSet;
2656 delete fUpperSet;
2657 delete fOLetterSet;
2658 delete fNumericSet;
2659 delete fATermSet;
2660 delete fSContinueSet;
2661 delete fSTermSet;
2662 delete fCloseSet;
2663 delete fOtherSet;
2664 delete fExtendSet;
2665 }
2666
2667
2668
2669 //-------------------------------------------------------------------------------------------
2670 //
2671 // RBBILineMonkey
2672 //
2673 //-------------------------------------------------------------------------------------------
2674
2675 class RBBILineMonkey: public RBBIMonkeyKind {
2676 public:
2677 RBBILineMonkey();
2678 virtual ~RBBILineMonkey();
2679 virtual UVector *charClasses();
2680 virtual void setText(const UnicodeString &s);
2681 virtual int32_t next(int32_t i);
2682 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2683 private:
2684 UVector *fSets;
2685
2686 UnicodeSet *fBK;
2687 UnicodeSet *fCR;
2688 UnicodeSet *fLF;
2689 UnicodeSet *fCM;
2690 UnicodeSet *fNL;
2691 UnicodeSet *fSG;
2692 UnicodeSet *fWJ;
2693 UnicodeSet *fZW;
2694 UnicodeSet *fGL;
2695 UnicodeSet *fCB;
2696 UnicodeSet *fSP;
2697 UnicodeSet *fB2;
2698 UnicodeSet *fBA;
2699 UnicodeSet *fBB;
2700 UnicodeSet *fHY;
2701 UnicodeSet *fH2;
2702 UnicodeSet *fH3;
2703 UnicodeSet *fCL;
2704 UnicodeSet *fCP;
2705 UnicodeSet *fEX;
2706 UnicodeSet *fIN;
2707 UnicodeSet *fJL;
2708 UnicodeSet *fJV;
2709 UnicodeSet *fJT;
2710 UnicodeSet *fNS;
2711 UnicodeSet *fOP;
2712 UnicodeSet *fQU;
2713 UnicodeSet *fIS;
2714 UnicodeSet *fNU;
2715 UnicodeSet *fPO;
2716 UnicodeSet *fPR;
2717 UnicodeSet *fSY;
2718 UnicodeSet *fAI;
2719 UnicodeSet *fAL;
2720 UnicodeSet *fCJ;
2721 UnicodeSet *fHL;
2722 UnicodeSet *fID;
2723 UnicodeSet *fRI;
2724 UnicodeSet *fSA;
2725 UnicodeSet *fXX;
2726
2727 BreakIterator *fCharBI;
2728
2729 const UnicodeString *fText;
2730 int32_t *fOrigPositions;
2731
2732 RegexMatcher *fNumberMatcher;
2733 RegexMatcher *fLB11Matcher;
2734 };
2735
2736
2737 RBBILineMonkey::RBBILineMonkey()
2738 {
2739 UErrorCode status = U_ZERO_ERROR;
2740
2741 fSets = new UVector(status);
2742
2743 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2744 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2745 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2746 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2747 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2748 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2749 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2750 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2751 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2752 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2753 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2754 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2755 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2756 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2757 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2758 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2759 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2760 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2761 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2762 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2763 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2764 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2765 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2766 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2767 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2768 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2769 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2770 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2771 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2772 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2773 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2774 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2775 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2776 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2777 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2778 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2779 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2780 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2781 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2782 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2783
2784 if (U_FAILURE(status)) {
2785 deferredStatus = status;
2786 fCharBI = NULL;
2787 fNumberMatcher = NULL;
2788 return;
2789 }
2790
2791 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2792 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2793 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
2794 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2795
2796 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2797
2798 fSets->addElement(fBK, status);
2799 fSets->addElement(fCR, status);
2800 fSets->addElement(fLF, status);
2801 fSets->addElement(fCM, status);
2802 fSets->addElement(fNL, status);
2803 fSets->addElement(fWJ, status);
2804 fSets->addElement(fZW, status);
2805 fSets->addElement(fGL, status);
2806 fSets->addElement(fCB, status);
2807 fSets->addElement(fSP, status);
2808 fSets->addElement(fB2, status);
2809 fSets->addElement(fBA, status);
2810 fSets->addElement(fBB, status);
2811 fSets->addElement(fHY, status);
2812 fSets->addElement(fH2, status);
2813 fSets->addElement(fH3, status);
2814 fSets->addElement(fCL, status);
2815 fSets->addElement(fCP, status);
2816 fSets->addElement(fEX, status);
2817 fSets->addElement(fIN, status);
2818 fSets->addElement(fJL, status);
2819 fSets->addElement(fJT, status);
2820 fSets->addElement(fJV, status);
2821 fSets->addElement(fNS, status);
2822 fSets->addElement(fOP, status);
2823 fSets->addElement(fQU, status);
2824 fSets->addElement(fIS, status);
2825 fSets->addElement(fNU, status);
2826 fSets->addElement(fPO, status);
2827 fSets->addElement(fPR, status);
2828 fSets->addElement(fSY, status);
2829 fSets->addElement(fAI, status);
2830 fSets->addElement(fAL, status);
2831 fSets->addElement(fHL, status);
2832 fSets->addElement(fID, status);
2833 fSets->addElement(fWJ, status);
2834 fSets->addElement(fRI, status);
2835 fSets->addElement(fSA, status);
2836 fSets->addElement(fSG, status);
2837
2838 const char *rules =
2839 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2840 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2841 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2842 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2843 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2844 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2845
2846 fNumberMatcher = new RegexMatcher(
2847 UnicodeString(rules, -1, US_INV), 0, status);
2848
2849 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2850
2851 if (U_FAILURE(status)) {
2852 deferredStatus = status;
2853 }
2854 }
2855
2856
2857 void RBBILineMonkey::setText(const UnicodeString &s) {
2858 fText = &s;
2859 fCharBI->setText(s);
2860 fNumberMatcher->reset(s);
2861 }
2862
2863 //
2864 // rule9Adjust
2865 // Line Break TR rules 9 and 10 implementation.
2866 // This deals with combining marks and other sequences that
2867 // that must be treated as if they were something other than what they actually are.
2868 //
2869 // This is factored out into a separate function because it must be applied twice for
2870 // each potential break, once to the chars before the position being checked, then
2871 // again to the text following the possible break.
2872 //
2873 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2874 if (pos == -1) {
2875 // Invalid initial position. Happens during the warmup iteration of the
2876 // main loop in next().
2877 return;
2878 }
2879
2880 int32_t nPos = *nextPos;
2881
2882 // LB 9 Keep combining sequences together.
2883 // advance over any CM class chars. Note that Line Break CM is different
2884 // from the normal Grapheme Extend property.
2885 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2886 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2887 for (;;) {
2888 *nextChar = fText->char32At(nPos);
2889 if (!fCM->contains(*nextChar)) {
2890 break;
2891 }
2892 nPos = fText->moveIndex32(nPos, 1);
2893 }
2894 }
2895
2896
2897 // LB 9 Treat X CM* as if it were x.
2898 // No explicit action required.
2899
2900 // LB 10 Treat any remaining combining mark as AL
2901 if (fCM->contains(*posChar)) {
2902 *posChar = 0x41; // thisChar = 'A';
2903 }
2904
2905 // Push the updated nextPos and nextChar back to our caller.
2906 // This only makes a difference if posChar got bigger by consuming a
2907 // combining sequence.
2908 *nextPos = nPos;
2909 *nextChar = fText->char32At(nPos);
2910 }
2911
2912
2913
2914 int32_t RBBILineMonkey::next(int32_t startPos) {
2915 UErrorCode status = U_ZERO_ERROR;
2916 int32_t pos; // Index of the char following a potential break position
2917 UChar32 thisChar; // Character at above position "pos"
2918
2919 int32_t prevPos; // Index of the char preceding a potential break position
2920 UChar32 prevChar; // Character at above position. Note that prevChar
2921 // and thisChar may not be adjacent because combining
2922 // characters between them will be ignored.
2923
2924 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2925 UChar32 prevCharX2;
2926
2927 int32_t nextPos; // Index of the next character following pos.
2928 // Usually skips over combining marks.
2929 int32_t nextCPPos; // Index of the code point following "pos."
2930 // May point to a combining mark.
2931 int32_t tPos; // temp value.
2932 UChar32 c;
2933
2934 if (U_FAILURE(deferredStatus)) {
2935 return -1;
2936 }
2937
2938 if (startPos >= fText->length()) {
2939 return -1;
2940 }
2941
2942
2943 // Initial values for loop. Loop will run the first time without finding breaks,
2944 // while the invalid values shift out and the "this" and
2945 // "prev" positions are filled in with good values.
2946 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2947 thisChar = prevChar = prevCharX2 = 0;
2948 nextPos = nextCPPos = startPos;
2949
2950
2951 // Loop runs once per position in the test text, until a break position
2952 // is found.
2953 for (;;) {
2954 prevPosX2 = prevPos;
2955 prevCharX2 = prevChar;
2956
2957 prevPos = pos;
2958 prevChar = thisChar;
2959
2960 pos = nextPos;
2961 thisChar = fText->char32At(pos);
2962
2963 nextCPPos = fText->moveIndex32(pos, 1);
2964 nextPos = nextCPPos;
2965
2966 // Rule LB2 - Break at end of text.
2967 if (pos >= fText->length()) {
2968 break;
2969 }
2970
2971 // Rule LB 9 - adjust for combining sequences.
2972 // We do this one out-of-order because the adjustment does not change anything
2973 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2974 // be applied.
2975 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2976 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2977 c = fText->char32At(nextPos);
2978 rule9Adjust(pos, &thisChar, &nextPos, &c);
2979
2980 // If the loop is still warming up - if we haven't shifted the initial
2981 // -1 positions out of prevPos yet - loop back to advance the
2982 // position in the input without any further looking for breaks.
2983 if (prevPos == -1) {
2984 continue;
2985 }
2986
2987 // LB 4 Always break after hard line breaks,
2988 if (fBK->contains(prevChar)) {
2989 break;
2990 }
2991
2992 // LB 5 Break after CR, LF, NL, but not inside CR LF
2993 if (prevChar == 0x0d && thisChar == 0x0a) {
2994 continue;
2995 }
2996 if (prevChar == 0x0d ||
2997 prevChar == 0x0a ||
2998 prevChar == 0x85) {
2999 break;
3000 }
3001
3002 // LB 6 Don't break before hard line breaks
3003 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3004 fBK->contains(thisChar)) {
3005 continue;
3006 }
3007
3008
3009 // LB 7 Don't break before spaces or zero-width space.
3010 if (fSP->contains(thisChar)) {
3011 continue;
3012 }
3013
3014 if (fZW->contains(thisChar)) {
3015 continue;
3016 }
3017
3018 // LB 8 Break after zero width space
3019 if (fZW->contains(prevChar)) {
3020 break;
3021 }
3022
3023 // LB 9, 10 Already done, at top of loop.
3024 //
3025
3026
3027 // LB 11 Do not break before or after WORD JOINER and related characters.
3028 // x WJ
3029 // WJ x
3030 //
3031 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3032 continue;
3033 }
3034
3035 // LB 12
3036 // GL x
3037 if (fGL->contains(prevChar)) {
3038 continue;
3039 }
3040
3041 // LB 12a
3042 // [^SP BA HY] x GL
3043 if (!(fSP->contains(prevChar) ||
3044 fBA->contains(prevChar) ||
3045 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3046 continue;
3047 }
3048
3049
3050
3051 // LB 13 Don't break before closings.
3052 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3053 // fall into LB 17 and the more general number regular expression.
3054 //
3055 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3056 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3057 fEX->contains(thisChar) ||
3058 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3059 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3060 continue;
3061 }
3062
3063 // LB 14 Don't break after OP SP*
3064 // Scan backwards, checking for this sequence.
3065 // The OP char could include combining marks, so we actually check for
3066 // OP CM* SP*
3067 // Another Twist: The Rule 67 fixes may have changed a SP CM
3068 // sequence into a ID char, so before scanning back through spaces,
3069 // verify that prevChar is indeed a space. The prevChar variable
3070 // may differ from fText[prevPos]
3071 tPos = prevPos;
3072 if (fSP->contains(prevChar)) {
3073 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3074 tPos=fText->moveIndex32(tPos, -1);
3075 }
3076 }
3077 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3078 tPos=fText->moveIndex32(tPos, -1);
3079 }
3080 if (fOP->contains(fText->char32At(tPos))) {
3081 continue;
3082 }
3083
3084
3085 // LB 15 QU SP* x OP
3086 if (fOP->contains(thisChar)) {
3087 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3088 int tPos = prevPos;
3089 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3090 tPos = fText->moveIndex32(tPos, -1);
3091 }
3092 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3093 tPos = fText->moveIndex32(tPos, -1);
3094 }
3095 if (fQU->contains(fText->char32At(tPos))) {
3096 continue;
3097 }
3098 }
3099
3100
3101
3102 // LB 16 (CL | CP) SP* x NS
3103 // Scan backwards for SP* CM* (CL | CP)
3104 if (fNS->contains(thisChar)) {
3105 int tPos = prevPos;
3106 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3107 tPos = fText->moveIndex32(tPos, -1);
3108 }
3109 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3110 tPos = fText->moveIndex32(tPos, -1);
3111 }
3112 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3113 continue;
3114 }
3115 }
3116
3117
3118 // LB 17 B2 SP* x B2
3119 if (fB2->contains(thisChar)) {
3120 // Scan backwards, checking for the B2 CM* SP* sequence.
3121 tPos = prevPos;
3122 if (fSP->contains(prevChar)) {
3123 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3124 tPos=fText->moveIndex32(tPos, -1);
3125 }
3126 }
3127 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3128 tPos=fText->moveIndex32(tPos, -1);
3129 }
3130 if (fB2->contains(fText->char32At(tPos))) {
3131 continue;
3132 }
3133 }
3134
3135
3136 // LB 18 break after space
3137 if (fSP->contains(prevChar)) {
3138 break;
3139 }
3140
3141 // LB 19
3142 // x QU
3143 // QU x
3144 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3145 continue;
3146 }
3147
3148 // LB 20 Break around a CB
3149 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3150 break;
3151 }
3152
3153 // LB 21
3154 if (fBA->contains(thisChar) ||
3155 fHY->contains(thisChar) ||
3156 fNS->contains(thisChar) ||
3157 fBB->contains(prevChar) ) {
3158 continue;
3159 }
3160
3161 // LB 21a
3162 // HL (HY | BA) x
3163 if (fHL->contains(prevCharX2) &&
3164 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3165 continue;
3166 }
3167
3168 // LB 21b - Added for Apple 13927604
3169 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3170 continue;
3171 }
3172
3173 // LB 22
3174 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3175 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3176 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3177 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3178 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3179 continue;
3180 }
3181
3182
3183 // LB 23 ID x PO
3184 // AL x NU
3185 // HL x NU
3186 // NU x AL
3187 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3188 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3189 (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3190 (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3191 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
3192 continue;
3193 }
3194
3195 // LB 24 Do not break between prefix and letters or ideographs.
3196 // PR x ID
3197 // PR x (AL | HL)
3198 // PO x (AL | HL)
3199 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3200 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3201 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) {
3202 continue;
3203 }
3204
3205
3206
3207 // LB 25 Numbers
3208 if (fNumberMatcher->lookingAt(prevPos, status)) {
3209 if (U_FAILURE(status)) {
3210 break;
3211 }
3212 // Matched a number. But could have been just a single digit, which would
3213 // not represent a "no break here" between prevChar and thisChar
3214 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3215 if (numEndIdx > pos) {
3216 // Number match includes at least our two chars being checked
3217 if (numEndIdx > nextPos) {
3218 // Number match includes additional chars. Update pos and nextPos
3219 // so that next loop iteration will continue at the end of the number,
3220 // checking for breaks between last char in number & whatever follows.
3221 pos = nextPos = numEndIdx;
3222 do {
3223 pos = fText->moveIndex32(pos, -1);
3224 thisChar = fText->char32At(pos);
3225 } while (fCM->contains(thisChar));
3226 }
3227 continue;
3228 }
3229 }
3230
3231
3232 // LB 26 Do not break a Korean syllable.
3233 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3234 fJV->contains(thisChar) ||
3235 fH2->contains(thisChar) ||
3236 fH3->contains(thisChar))) {
3237 continue;
3238 }
3239
3240 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3241 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3242 continue;
3243 }
3244
3245 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3246 fJT->contains(thisChar)) {
3247 continue;
3248 }
3249
3250 // LB 27 Treat a Korean Syllable Block the same as ID.
3251 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3252 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3253 fIN->contains(thisChar)) {
3254 continue;
3255 }
3256 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3257 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3258 fPO->contains(thisChar)) {
3259 continue;
3260 }
3261 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3262 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3263 continue;
3264 }
3265
3266
3267
3268 // LB 28 Do not break between alphabetics ("at").
3269 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3270 continue;
3271 }
3272
3273 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3274 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3275 continue;
3276 }
3277
3278 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3279 // (AL | NU) x OP
3280 // CP x (AL | NU)
3281 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3282 continue;
3283 }
3284 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3285 continue;
3286 }
3287
3288 // LB30a Do not break between regional indicators.
3289 // RI x RI
3290 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3291 continue;
3292 }
3293
3294 // LB 31 Break everywhere else
3295 break;
3296
3297 }
3298
3299 return pos;
3300 }
3301
3302
3303 UVector *RBBILineMonkey::charClasses() {
3304 return fSets;
3305 }
3306
3307
3308 RBBILineMonkey::~RBBILineMonkey() {
3309 delete fSets;
3310
3311 delete fBK;
3312 delete fCR;
3313 delete fLF;
3314 delete fCM;
3315 delete fNL;
3316 delete fWJ;
3317 delete fZW;
3318 delete fGL;
3319 delete fCB;
3320 delete fSP;
3321 delete fB2;
3322 delete fBA;
3323 delete fBB;
3324 delete fHY;
3325 delete fH2;
3326 delete fH3;
3327 delete fCL;
3328 delete fCP;
3329 delete fEX;
3330 delete fIN;
3331 delete fJL;
3332 delete fJV;
3333 delete fJT;
3334 delete fNS;
3335 delete fOP;
3336 delete fQU;
3337 delete fIS;
3338 delete fNU;
3339 delete fPO;
3340 delete fPR;
3341 delete fSY;
3342 delete fAI;
3343 delete fAL;
3344 delete fCJ;
3345 delete fHL;
3346 delete fID;
3347 delete fRI;
3348 delete fSA;
3349 delete fSG;
3350 delete fXX;
3351
3352 delete fCharBI;
3353 delete fNumberMatcher;
3354 }
3355
3356
3357 //-------------------------------------------------------------------------------------------
3358 //
3359 // TestMonkey
3360 //
3361 // params
3362 // seed=nnnnn Random number starting seed.
3363 // Setting the seed allows errors to be reproduced.
3364 // loop=nnn Looping count. Controls running time.
3365 // -1: run forever.
3366 // 0 or greater: run length.
3367 //
3368 // type = char | word | line | sent | title
3369 //
3370 //-------------------------------------------------------------------------------------------
3371
3372 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3373 int32_t val = defaultVal;
3374 name.append(" *= *(-?\\d+)");
3375 UErrorCode status = U_ZERO_ERROR;
3376 RegexMatcher m(name, params, 0, status);
3377 if (m.find()) {
3378 // The param exists. Convert the string to an int.
3379 char valString[100];
3380 int32_t paramLength = m.end(1, status) - m.start(1, status);
3381 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3382 paramLength = (int32_t)(sizeof(valString)-2);
3383 }
3384 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3385 val = strtol(valString, NULL, 10);
3386
3387 // Delete this parameter from the params string.
3388 m.reset();
3389 params = m.replaceFirst("", status);
3390 }
3391 U_ASSERT(U_SUCCESS(status));
3392 return val;
3393 }
3394 #endif
3395
3396 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3397 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3398 BreakIterator *bi,
3399 int expected[],
3400 int expectedcount)
3401 {
3402 int count = 0;
3403 int i = 0;
3404 int forward[50];
3405 bi->setText(ustr);
3406 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3407 forward[count] = i;
3408 if (count < expectedcount && expected[count] != i) {
3409 test->errln("break forward test failed: expected %d but got %d",
3410 expected[count], i);
3411 break;
3412 }
3413 count ++;
3414 }
3415 if (count != expectedcount) {
3416 printStringBreaks(ustr, expected, expectedcount);
3417 test->errln("break forward test failed: missed %d match",
3418 expectedcount - count);
3419 return;
3420 }
3421 // testing boundaries
3422 for (i = 1; i < expectedcount; i ++) {
3423 int j = expected[i - 1];
3424 if (!bi->isBoundary(j)) {
3425 printStringBreaks(ustr, expected, expectedcount);
3426 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3427 return;
3428 }
3429 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3430 if (bi->isBoundary(j)) {
3431 printStringBreaks(ustr, expected, expectedcount);
3432 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3433 return;
3434 }
3435 }
3436 }
3437
3438 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3439 count --;
3440 if (forward[count] != i) {
3441 printStringBreaks(ustr, expected, expectedcount);
3442 test->errln("happy break test previous() failed: expected %d but got %d",
3443 forward[count], i);
3444 break;
3445 }
3446 }
3447 if (count != 0) {
3448 printStringBreaks(ustr, expected, expectedcount);
3449 test->errln("break test previous() failed: missed a match");
3450 return;
3451 }
3452
3453 // testing preceding
3454 for (i = 0; i < expectedcount - 1; i ++) {
3455 // int j = expected[i] + 1;
3456 int j = ustr.moveIndex32(expected[i], 1);
3457 for (; j <= expected[i + 1]; j ++) {
3458 if (bi->preceding(j) != expected[i]) {
3459 printStringBreaks(ustr, expected, expectedcount);
3460 test->errln("preceding(): Not expecting boundary at position %d", j);
3461 return;
3462 }
3463 }
3464 }
3465 }
3466 #endif
3467
3468 void RBBITest::TestWordBreaks(void)
3469 {
3470 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3471
3472 Locale locale("en");
3473 UErrorCode status = U_ZERO_ERROR;
3474 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3475 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3476 // Replaced any C+J characters in a row with a random sequence of characters
3477 // of the same length to make our C+J segmentation not get in the way.
3478 static const char *strlist[] =
3479 {
3480 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3481 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3482 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3483 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3484 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3485 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3486 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3487 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3488 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3489 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3490 "\\u2027\\U000e0067\\u0a47\\u00b7",
3491 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3492 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3493 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3494 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3495 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3496 "\\u0027\\u11af\\U000e0057\\u0602",
3497 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3498 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3499 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3500 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3501 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3502 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3503 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3504 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3505 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3506 "\\u18f4\\U000e0049\\u20e7\\u2027",
3507 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3508 "\\ua183\\u102d\\u0bec\\u003a",
3509 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3510 "\\u003a\\u0e57\\u0fad\\u002e",
3511 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3512 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3513 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3514 "\\u003a\\u0664\\u00b7\\u1fba",
3515 "\\u003b\\u0027\\u00b7\\u47a3",
3516 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3517 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3518 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3519 };
3520 int loop;
3521 if (U_FAILURE(status)) {
3522 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3523 return;
3524 }
3525 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3526 // printf("looping %d\n", loop);
3527 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3528 // RBBICharMonkey monkey;
3529 RBBIWordMonkey monkey;
3530
3531 int expected[50];
3532 int expectedcount = 0;
3533
3534 monkey.setText(ustr);
3535 int i;
3536 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3537 expected[expectedcount ++] = i;
3538 }
3539
3540 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3541 }
3542 delete bi;
3543 #endif
3544 }
3545
3546 void RBBITest::TestWordBoundary(void)
3547 {
3548 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3549 Locale locale("en");
3550 UErrorCode status = U_ZERO_ERROR;
3551 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3552 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3553 UChar str[50];
3554 static const char *strlist[] =
3555 {
3556 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3557 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3558 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3559 "\\u2027\\U000e0067\\u0a47\\u00b7",
3560 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3561 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3562 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3563 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3564 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3565 "\\u0027\\u11af\\U000e0057\\u0602",
3566 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3567 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3568 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3569 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3570 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3571 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3572 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3573 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3574 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3575 "\\u58f4\\U000e0049\\u20e7\\u2027",
3576 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3577 "\\ua183\\u102d\\u0bec\\u003a",
3578 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3579 "\\u003a\\u0e57\\u0fad\\u002e",
3580 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3581 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3582 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3583 "\\u003a\\u0664\\u00b7\\u1fba",
3584 "\\u003b\\u0027\\u00b7\\u47a3",
3585 };
3586 int loop;
3587 if (U_FAILURE(status)) {
3588 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3589 return;
3590 }
3591 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3592 // printf("looping %d\n", loop);
3593 u_unescape(strlist[loop], str, 20);
3594 UnicodeString ustr(str);
3595 int forward[50];
3596 int count = 0;
3597
3598 bi->setText(ustr);
3599 int prev = 0;
3600 int i;
3601 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3602 forward[count ++] = i;
3603 if (i > prev) {
3604 int j;
3605 for (j = prev + 1; j < i; j ++) {
3606 if (bi->isBoundary(j)) {
3607 printStringBreaks(ustr, forward, count);
3608 errln("happy boundary test failed: expected %d not a boundary",
3609 j);
3610 return;
3611 }
3612 }
3613 }
3614 if (!bi->isBoundary(i)) {
3615 printStringBreaks(ustr, forward, count);
3616 errln("happy boundary test failed: expected %d a boundary",
3617 i);
3618 return;
3619 }
3620 prev = i;
3621 }
3622 }
3623 delete bi;
3624 }
3625
3626 void RBBITest::TestLineBreaks(void)
3627 {
3628 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3629 Locale locale("en");
3630 UErrorCode status = U_ZERO_ERROR;
3631 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3632 const int32_t STRSIZE = 50;
3633 UChar str[STRSIZE];
3634 static const char *strlist[] =
3635 {
3636 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3637 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3638 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3639 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3640 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3641 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3642 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3643 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3644 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3645 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3646 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3647 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3648 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3649 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3650 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3651 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3652 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3653 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3654 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3655 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3656 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3657 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3658 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3659 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3660 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3661 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3662 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3663 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3664 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3665 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3666 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3667 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3668 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3669 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3670 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3671 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3672 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3673 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3674 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3675 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3676 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3677 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3678 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3679 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3680 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3681 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3682 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3683 };
3684 int loop;
3685 TEST_ASSERT_SUCCESS(status);
3686 if (U_FAILURE(status)) {
3687 return;
3688 }
3689 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3690 // printf("looping %d\n", loop);
3691 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3692 if (t >= STRSIZE) {
3693 TEST_ASSERT(FALSE);
3694 continue;
3695 }
3696
3697
3698 UnicodeString ustr(str);
3699 RBBILineMonkey monkey;
3700 if (U_FAILURE(monkey.deferredStatus)) {
3701 continue;
3702 }
3703
3704 const int EXPECTEDSIZE = 50;
3705 int expected[EXPECTEDSIZE];
3706 int expectedcount = 0;
3707
3708 monkey.setText(ustr);
3709 int i;
3710 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3711 if (expectedcount >= EXPECTEDSIZE) {
3712 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3713 return;
3714 }
3715 expected[expectedcount ++] = i;
3716 }
3717
3718 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3719 }
3720 delete bi;
3721 #endif
3722 }
3723
3724 void RBBITest::TestSentBreaks(void)
3725 {
3726 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3727 Locale locale("en");
3728 UErrorCode status = U_ZERO_ERROR;
3729 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3730 UChar str[200];
3731 static const char *strlist[] =
3732 {
3733 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3734 "This\n",
3735 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3736 "\"Sentence ending with a quote.\" Bye.",
3737 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3738 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3739 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3740 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3741 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3742 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3743 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3744 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3745 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3746 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3747 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3748 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3749 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3750 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3751 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3752 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3753 };
3754 int loop;
3755 if (U_FAILURE(status)) {
3756 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3757 return;
3758 }
3759 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3760 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3761 UnicodeString ustr(str);
3762
3763 RBBISentMonkey monkey;
3764 if (U_FAILURE(monkey.deferredStatus)) {
3765 continue;
3766 }
3767
3768 const int EXPECTEDSIZE = 50;
3769 int expected[EXPECTEDSIZE];
3770 int expectedcount = 0;
3771
3772 monkey.setText(ustr);
3773 int i;
3774 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3775 if (expectedcount >= EXPECTEDSIZE) {
3776 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3777 return;
3778 }
3779 expected[expectedcount ++] = i;
3780 }
3781
3782 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3783 }
3784 delete bi;
3785 #endif
3786 }
3787
3788 void RBBITest::TestMonkey(char *params) {
3789 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3790
3791 UErrorCode status = U_ZERO_ERROR;
3792 int32_t loopCount = 500;
3793 int32_t seed = 1;
3794 UnicodeString breakType = "all";
3795 Locale locale("en");
3796 UBool useUText = FALSE;
3797
3798 if (quick == FALSE) {
3799 loopCount = 10000;
3800 }
3801
3802 if (params) {
3803 UnicodeString p(params);
3804 loopCount = getIntParam("loop", p, loopCount);
3805 seed = getIntParam("seed", p, seed);
3806
3807 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3808 if (m.find()) {
3809 breakType = m.group(1, status);
3810 m.reset();
3811 p = m.replaceFirst("", status);
3812 }
3813
3814 RegexMatcher u(" *utext", p, 0, status);
3815 if (u.find()) {
3816 useUText = TRUE;
3817 u.reset();
3818 p = u.replaceFirst("", status);
3819 }
3820
3821
3822 // m.reset(p);
3823 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3824 // Each option is stripped out of the option string as it is processed.
3825 // All options have been checked. The option string should have been completely emptied..
3826 char buf[100];
3827 p.extract(buf, sizeof(buf), NULL, status);
3828 buf[sizeof(buf)-1] = 0;
3829 errln("Unrecognized or extra parameter: %s\n", buf);
3830 return;
3831 }
3832
3833 }
3834
3835 if (breakType == "char" || breakType == "all") {
3836 RBBICharMonkey m;
3837 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3838 if (U_SUCCESS(status)) {
3839 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3840 if (breakType == "all" && useUText==FALSE) {
3841 // Also run a quick test with UText when "all" is specified
3842 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3843 }
3844 }
3845 else {
3846 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3847 }
3848 delete bi;
3849 }
3850
3851 if (breakType == "word" || breakType == "all") {
3852 logln("Word Break Monkey Test");
3853 RBBIWordMonkey m;
3854 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3855 if (U_SUCCESS(status)) {
3856 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3857 }
3858 else {
3859 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3860 }
3861 delete bi;
3862 }
3863
3864 if (breakType == "line" || breakType == "all") {
3865 logln("Line Break Monkey Test");
3866 RBBILineMonkey m;
3867 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3868 if (loopCount >= 10) {
3869 loopCount = loopCount / 5; // Line break runs slower than the others.
3870 }
3871 if (U_SUCCESS(status)) {
3872 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3873 }
3874 else {
3875 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3876 }
3877 delete bi;
3878 }
3879
3880 if (breakType == "sent" || breakType == "all" ) {
3881 logln("Sentence Break Monkey Test");
3882 RBBISentMonkey m;
3883 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3884 if (loopCount >= 10) {
3885 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3886 }
3887 if (U_SUCCESS(status)) {
3888 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3889 }
3890 else {
3891 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3892 }
3893 delete bi;
3894 }
3895
3896 #endif
3897 }
3898
3899 //
3900 // Run a RBBI monkey test. Common routine, for all break iterator types.
3901 // Parameters:
3902 // bi - the break iterator to use
3903 // mk - MonkeyKind, abstraction for obtaining expected results
3904 // name - Name of test (char, word, etc.) for use in error messages
3905 // seed - Seed for starting random number generator (parameter from user)
3906 // numIterations
3907 //
3908 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3909 int32_t numIterations, UBool useUText) {
3910
3911 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3912
3913 const int32_t TESTSTRINGLEN = 500;
3914 UnicodeString testText;
3915 int32_t numCharClasses;
3916 UVector *chClasses;
3917 int expected[TESTSTRINGLEN*2 + 1];
3918 int expectedCount = 0;
3919 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3920 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3921 char reverseBreaks[TESTSTRINGLEN*2+1];
3922 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3923 char followingBreaks[TESTSTRINGLEN*2+1];
3924 char precedingBreaks[TESTSTRINGLEN*2+1];
3925 int i;
3926 int loopCount = 0;
3927
3928 m_seed = seed;
3929
3930 numCharClasses = mk.charClasses()->size();
3931 chClasses = mk.charClasses();
3932
3933 // Check for errors that occured during the construction of the MonkeyKind object.
3934 // Can't report them where they occured because errln() is a method coming from intlTest,
3935 // and is not visible outside of RBBITest :-(
3936 if (U_FAILURE(mk.deferredStatus)) {
3937 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3938 return;
3939 }
3940
3941 // Verify that the character classes all have at least one member.
3942 for (i=0; i<numCharClasses; i++) {
3943 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3944 if (s == NULL || s->size() == 0) {
3945 errln("Character Class #%d is null or of zero size.", i);
3946 return;
3947 }
3948 }
3949
3950 while (loopCount < numIterations || numIterations == -1) {
3951 if (numIterations == -1 && loopCount % 10 == 0) {
3952 // If test is running in an infinite loop, display a periodic tic so
3953 // we can tell that it is making progress.
3954 fprintf(stderr, ".");
3955 }
3956 // Save current random number seed, so that we can recreate the random numbers
3957 // for this loop iteration in event of an error.
3958 seed = m_seed;
3959
3960 // Populate a test string with data.
3961 testText.truncate(0);
3962 for (i=0; i<TESTSTRINGLEN; i++) {
3963 int32_t aClassNum = m_rand() % numCharClasses;
3964 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3965 int32_t charIdx = m_rand() % classSet->size();
3966 UChar32 c = classSet->charAt(charIdx);
3967 if (c < 0) { // TODO: deal with sets containing strings.
3968 errln("c < 0");
3969 break;
3970 }
3971 testText.append(c);
3972 }
3973
3974 // Calculate the expected results for this test string.
3975 mk.setText(testText);
3976 memset(expectedBreaks, 0, sizeof(expectedBreaks));
3977 expectedBreaks[0] = 1;
3978 int32_t breakPos = 0;
3979 expectedCount = 0;
3980 for (;;) {
3981 breakPos = mk.next(breakPos);
3982 if (breakPos == -1) {
3983 break;
3984 }
3985 if (breakPos > testText.length()) {
3986 errln("breakPos > testText.length()");
3987 }
3988 expectedBreaks[breakPos] = 1;
3989 U_ASSERT(expectedCount<testText.length());
3990 expected[expectedCount ++] = breakPos;
3991 }
3992
3993 // Find the break positions using forward iteration
3994 memset(forwardBreaks, 0, sizeof(forwardBreaks));
3995 if (useUText) {
3996 UErrorCode status = U_ZERO_ERROR;
3997 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3998 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3999 bi->setText(testUText, status);
4000 TEST_ASSERT_SUCCESS(status);
4001 utext_close(testUText); // The break iterator does a shallow clone of the UText
4002 // This UText can be closed immediately, so long as the
4003 // testText string continues to exist.
4004 } else {
4005 bi->setText(testText);
4006 }
4007
4008 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4009 if (i < 0 || i > testText.length()) {
4010 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4011 break;
4012 }
4013 forwardBreaks[i] = 1;
4014 }
4015
4016 // Find the break positions using reverse iteration
4017 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4018 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4019 if (i < 0 || i > testText.length()) {
4020 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4021 break;
4022 }
4023 reverseBreaks[i] = 1;
4024 }
4025
4026 // Find the break positions using isBoundary() tests.
4027 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4028 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4029 for (i=0; i<=testText.length(); i++) {
4030 isBoundaryBreaks[i] = bi->isBoundary(i);
4031 }
4032
4033
4034 // Find the break positions using the following() function.
4035 // printf(".");
4036 memset(followingBreaks, 0, sizeof(followingBreaks));
4037 int32_t lastBreakPos = 0;
4038 followingBreaks[0] = 1;
4039 for (i=0; i<testText.length(); i++) {
4040 breakPos = bi->following(i);
4041 if (breakPos <= i ||
4042 breakPos < lastBreakPos ||
4043 breakPos > testText.length() ||
4044 (breakPos > lastBreakPos && lastBreakPos > i)) {
4045 UChar32 brkChar = testText.char32At(lastBreakPos);
4046 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4047 errln("%s break monkey test: "
4048 "Out of range value returned by BreakIterator::following().\n"
4049 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4050 name, seed, i, breakPos, lastBreakPos);
4051 }
4052 break;
4053 }
4054 followingBreaks[breakPos] = 1;
4055 lastBreakPos = breakPos;
4056 }
4057
4058 // Find the break positions using the preceding() function.
4059 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4060 lastBreakPos = testText.length();
4061 precedingBreaks[testText.length()] = 1;
4062 for (i=testText.length(); i>0; i--) {
4063 breakPos = bi->preceding(i);
4064 if (breakPos >= i ||
4065 breakPos > lastBreakPos ||
4066 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4067 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4068 UChar32 brkChar = testText.char32At(breakPos);
4069 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4070 errln("%s break monkey test: "
4071 "Out of range value returned by BreakIterator::preceding().\n"
4072 "index=%d; prev returned %d; lastBreak=%d" ,
4073 name, i, breakPos, lastBreakPos);
4074 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4075 precedingBreaks[i] = 2; // Forces an error.
4076 }
4077 }
4078 } else {
4079 if (breakPos >= 0) {
4080 precedingBreaks[breakPos] = 1;
4081 }
4082 lastBreakPos = breakPos;
4083 }
4084 }
4085
4086 // Compare the expected and actual results.
4087 for (i=0; i<=testText.length(); i++) {
4088 const char *errorType = NULL;
4089 if (forwardBreaks[i] != expectedBreaks[i]) {
4090 errorType = "next()";
4091 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4092 errorType = "previous()";
4093 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4094 errorType = "isBoundary()";
4095 } else if (followingBreaks[i] != expectedBreaks[i]) {
4096 errorType = "following()";
4097 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4098 errorType = "preceding()";
4099 }
4100
4101
4102 if (errorType != NULL) {
4103 // Format a range of the test text that includes the failure as
4104 // a data item that can be included in the rbbi test data file.
4105
4106 // Start of the range is the last point where expected and actual results
4107 // both agreed that there was a break position.
4108 int startContext = i;
4109 int32_t count = 0;
4110 for (;;) {
4111 if (startContext==0) { break; }
4112 startContext --;
4113 if (expectedBreaks[startContext] != 0) {
4114 if (count == 2) break;
4115 count ++;
4116 }
4117 }
4118
4119 // End of range is two expected breaks past the start position.
4120 int endContext = i + 1;
4121 int ci;
4122 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4123 for (;;) {
4124 if (endContext >= testText.length()) {break;}
4125 if (expectedBreaks[endContext-1] != 0) {
4126 if (count == 0) break;
4127 count --;
4128 }
4129 endContext ++;
4130 }
4131 }
4132
4133 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4134 UnicodeString errorText = "<data>";
4135 /***if (strcmp(errorType, "next()") == 0) {
4136 startContext = 0;
4137 endContext = testText.length();
4138
4139 printStringBreaks(testText, expected, expectedCount);
4140 }***/
4141
4142 for (ci=startContext; ci<endContext;) {
4143 UnicodeString hexChars("0123456789abcdef");
4144 UChar32 c;
4145 int bn;
4146 c = testText.char32At(ci);
4147 if (ci == i) {
4148 // This is the location of the error.
4149 errorText.append("<?>");
4150 } else if (expectedBreaks[ci] != 0) {
4151 // This a non-error expected break position.
4152 errorText.append("\\");
4153 }
4154 if (c < 0x10000) {
4155 errorText.append("\\u");
4156 for (bn=12; bn>=0; bn-=4) {
4157 errorText.append(hexChars.charAt((c>>bn)&0xf));
4158 }
4159 } else {
4160 errorText.append("\\U");
4161 for (bn=28; bn>=0; bn-=4) {
4162 errorText.append(hexChars.charAt((c>>bn)&0xf));
4163 }
4164 }
4165 ci = testText.moveIndex32(ci, 1);
4166 }
4167 errorText.append("\\");
4168 errorText.append("</data>\n");
4169
4170 // Output the error
4171 char charErrorTxt[500];
4172 UErrorCode status = U_ZERO_ERROR;
4173 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4174 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4175 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4176
4177 UChar32 brkChar = testText.char32At(i);
4178 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4179 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4180 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4181 errorType, seed, i, charErrorTxt);
4182 }
4183 break;
4184 }
4185 }
4186
4187 loopCount++;
4188 }
4189 #endif
4190 }
4191
4192
4193 // Bug 5532. UTF-8 based UText fails in dictionary code.
4194 // This test checks the initial patch,
4195 // which is to just keep it from crashing. Correct word boundaries
4196 // await a proper fix to the dictionary code.
4197 //
4198 void RBBITest::TestBug5532(void) {
4199 // Text includes a mixture of Thai and Latin.
4200 const unsigned char utf8Data[] = {
4201 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4202 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4203 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4204 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4205 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4206 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4207 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4208 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4209 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4210 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4211 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4212
4213 UErrorCode status = U_ZERO_ERROR;
4214 UText utext=UTEXT_INITIALIZER;
4215 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4216 TEST_ASSERT_SUCCESS(status);
4217
4218 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4219 TEST_ASSERT_SUCCESS(status);
4220 if (U_SUCCESS(status)) {
4221 bi->setText(&utext, status);
4222 TEST_ASSERT_SUCCESS(status);
4223
4224 int32_t breakCount = 0;
4225 int32_t previousBreak = -1;
4226 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4227 // For now, just make sure that the break iterator doesn't hang.
4228 TEST_ASSERT(previousBreak < bi->current());
4229 previousBreak = bi->current();
4230 }
4231 TEST_ASSERT(breakCount > 0);
4232 }
4233 delete bi;
4234 utext_close(&utext);
4235 }
4236
4237
4238 void RBBITest::TestBug9983(void) {
4239 UnicodeString text = UnicodeString("\\u002A" // * Other
4240 "\\uFF65" // Other
4241 "\\u309C" // Katakana
4242 "\\uFF9F" // Extend
4243 "\\uFF65" // Other
4244 "\\u0020" // Other
4245 "\\u0000").unescape();
4246
4247 UErrorCode status = U_ZERO_ERROR;
4248 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4249 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4250 TEST_ASSERT_SUCCESS(status);
4251 if (U_FAILURE(status)) {
4252 return;
4253 }
4254 brkiter->setText(text);
4255 int32_t offset, rstatus;
4256 brkiter->last();
4257 int32_t iterationCount = 0;
4258 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4259 iterationCount++;
4260 rstatus = brkiter->getRuleStatus();
4261 // printf(" %d(%d)", offset, rstatus);
4262 if (iterationCount >= 10) {
4263 break;
4264 }
4265 }
4266 TEST_ASSERT(iterationCount == 6);
4267 }
4268
4269
4270 //
4271 // TestDebug - A place-holder test for debugging purposes.
4272 // For putting in fragments of other tests that can be invoked
4273 // for tracing without a lot of unwanted extra stuff happening.
4274 //
4275 void RBBITest::TestDebug(void) {
4276 #if 0
4277 UErrorCode status = U_ZERO_ERROR;
4278 int pos = 0;
4279 int ruleStatus = 0;
4280
4281 RuleBasedBreakIterator* bi =
4282 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4283 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4284 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4285 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4286 // UnicodeString s("Aaa. Bcd");
4287 s = s.unescape();
4288 bi->setText(s);
4289 UBool r = bi->isBoundary(8);
4290 printf("%s", r?"true":"false");
4291 return;
4292 pos = bi->last();
4293 do {
4294 // ruleStatus = bi->getRuleStatus();
4295 printf("%d\t%d\n", pos, ruleStatus);
4296 pos = bi->previous();
4297 } while (pos != BreakIterator::DONE);
4298 #endif
4299 }
4300
4301 void RBBITest::TestProperties() {
4302 UErrorCode errorCode = U_ZERO_ERROR;
4303 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4304 if (!prependSet.isEmpty()) {
4305 errln(
4306 "[:GCB=Prepend:] is not empty any more. "
4307 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4308 "change this test to the opposite condition.");
4309 }
4310 }
4311
4312 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */