]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
73c04bcf
A
1/********************************************************************
2 * COPYRIGHT:
51004dcb 3 * Copyright (c) 1999-2013, International Business Machines Corporation and
73c04bcf
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7* Date Name Description
8* 12/15/99 Madhu Creation.
9* 01/12/2000 Madhu Updated for changed API and added new tests
10************************************************************************/
11
51004dcb 12#include "utypeinfo.h" // for 'typeid' to work
729e4ab9 13
73c04bcf
A
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
4388f060
A
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
73c04bcf
A
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "uvector.h"
35#include "uvectr32.h"
73c04bcf
A
36#include <string.h>
37#include <stdio.h>
38#include <stdlib.h>
51004dcb
A
39#include "unicode/numfmt.h"
40#include "unicode/uscript.h"
73c04bcf
A
41
42#define TEST_ASSERT(x) {if (!(x)) { \
43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
44
46f4442e 45#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
729e4ab9 46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
73c04bcf
A
47
48
46f4442e
A
49//---------------------------------------------
50// runIndexedTest
51//---------------------------------------------
52
4388f060
A
53
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitest.txt.  In most cases it can:
//       it's much less work than writing a new test, diagnostic output in the event of
//       failures is good, and the test data file is shared with ICU4J, so eventually
//       the test will run there as well, without additional effort.
59
46f4442e
A
60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61{
62 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63
64 switch (index) {
729e4ab9 65#if !UCONFIG_NO_FILE_IO
46f4442e
A
66 case 0: name = "TestBug4153072";
67 if(exec) TestBug4153072(); break;
729e4ab9
A
68#else
69 case 0: name = "skip";
70 break;
71#endif
72
4388f060
A
73 case 1: name = "skip";
74 break;
46f4442e
A
75 case 2: name = "TestStatusReturn";
76 if(exec) TestStatusReturn(); break;
729e4ab9
A
77
78#if !UCONFIG_NO_FILE_IO
46f4442e
A
79 case 3: name = "TestUnicodeFiles";
80 if(exec) TestUnicodeFiles(); break;
81 case 4: name = "TestEmptyString";
82 if(exec) TestEmptyString(); break;
729e4ab9
A
83#else
84 case 3: case 4: name = "skip";
85 break;
86#endif
46f4442e
A
87
88 case 5: name = "TestGetAvailableLocales";
89 if(exec) TestGetAvailableLocales(); break;
90
91 case 6: name = "TestGetDisplayName";
92 if(exec) TestGetDisplayName(); break;
93
729e4ab9 94#if !UCONFIG_NO_FILE_IO
46f4442e
A
95 case 7: name = "TestEndBehaviour";
96 if(exec) TestEndBehaviour(); break;
4388f060
A
97 case 8: case 9: case 10: name = "skip";
98 break;
46f4442e
A
99 case 11: name = "TestWordBreaks";
100 if(exec) TestWordBreaks(); break;
101 case 12: name = "TestWordBoundary";
102 if(exec) TestWordBoundary(); break;
103 case 13: name = "TestLineBreaks";
104 if(exec) TestLineBreaks(); break;
105 case 14: name = "TestSentBreaks";
106 if(exec) TestSentBreaks(); break;
107 case 15: name = "TestExtended";
108 if(exec) TestExtended(); break;
729e4ab9
A
109#else
110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111 break;
112#endif
113
4388f060 114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
51004dcb
A
115 case 16:
116 name = "TestMonkey"; if(exec) TestMonkey(params); break;
4388f060 117#else
729e4ab9 118 case 16:
4388f060
A
119 name = "skip"; break;
120#endif
729e4ab9
A
121
122#if !UCONFIG_NO_FILE_IO
46f4442e
A
123 case 17: name = "TestBug3818";
124 if(exec) TestBug3818(); break;
729e4ab9 125#else
4388f060 126 case 17: name = "skip";
729e4ab9
A
127 break;
128#endif
129
4388f060
A
130 case 18: name = "skip";
131 break;
46f4442e
A
132 case 19: name = "TestDebug";
133 if(exec) TestDebug(); break;
51004dcb
A
134 case 20: name = "skip";
135 break;
729e4ab9
A
136
137#if !UCONFIG_NO_FILE_IO
46f4442e 138 case 21: name = "TestBug5775";
729e4ab9 139 if (exec) TestBug5775(); break;
729e4ab9 140#else
4388f060 141 case 21: name = "skip";
729e4ab9
A
142 break;
143#endif
4388f060 144
51004dcb
A
145 case 22: name = "TestBug9983";
146 if (exec) TestBug9983(); break;
4388f060 147 case 23: name = "TestDictRules";
729e4ab9 148 if (exec) TestDictRules(); break;
4388f060 149 case 24: name = "TestBug5532";
729e4ab9 150 if (exec) TestBug5532(); break;
46f4442e
A
151 default: name = ""; break; //needed to end loop
152 }
153}
154
155
73c04bcf
A
//---------------------------------------------------------------------------
//
//   class BITestData   Holds a set of Break iterator test data and results
//                      Includes
//                         - the string data to be broken
//                         - a vector of the expected break positions.
//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
//                         - The expected break tag values.
//                         - Vectors of actual break positions and tag values.
//                         - Functions for comparing actual with expected and
//                            reporting errors.
//
//----------------------------------------------------------------------------
170class BITestData {
171public:
172 UnicodeString fDataToBreak;
173 UVector fExpectedBreakPositions;
174 UVector fExpectedTags;
175 UVector fLineNum;
176 UVector fActualBreakPositions; // Test Results.
177 UVector fActualTags;
178
179 BITestData(UErrorCode &status);
180 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
181 void checkResults(const char *heading, RBBITest *test);
182 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
183 void clearResults();
184};
185
186//
187// Constructor.
188//
189BITestData::BITestData(UErrorCode &status)
190: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
191 fActualTags(status)
192{
193}
194
//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
//                 The macro form collects the line number, which is helpful
//                 when tracking down failures.
//
//                 A null data item is inserted at the start of each test's data
//                 to put the starting zero into the data list.  The position saved for
//                 each non-null item is its ending position.
//
204#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206 if (U_FAILURE(status)) {return;}
207 if (data != NULL) {
208 fDataToBreak.append(CharsToUnicodeString(data));
209 }
210 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211 fExpectedTags.addElement(tag, status);
212 fLineNum.addElement(lineNum, status);
213}
214
215
216//
217// checkResults. Compare the actual and expected break positions, report any differences.
218//
219void BITestData::checkResults(const char *heading, RBBITest *test) {
220 int32_t expectedIndex = 0;
221 int32_t actualIndex = 0;
222
223 for (;;) {
224 // If we've run through both the expected and actual results vectors, we're done.
225 // break out of the loop.
226 if (expectedIndex >= fExpectedBreakPositions.size() &&
227 actualIndex >= fActualBreakPositions.size()) {
228 break;
229 }
230
231
232 if (expectedIndex >= fExpectedBreakPositions.size()) {
233 err(heading, test, expectedIndex-1, actualIndex);
234 actualIndex++;
235 continue;
236 }
237
238 if (actualIndex >= fActualBreakPositions.size()) {
239 err(heading, test, expectedIndex, actualIndex-1);
240 expectedIndex++;
241 continue;
242 }
243
244 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245 err(heading, test, expectedIndex, actualIndex);
246 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248 actualIndex++;
249 } else {
250 expectedIndex++;
251 }
252 continue;
253 }
254
255 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
257 heading, fLineNum.elementAt(expectedIndex),
258 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259 }
260
261 actualIndex++;
262 expectedIndex++;
263 }
264}
265
266//
267// err - An error was found. Report it, along with information about where the
268// incorrectly broken test data appeared in the source file.
269//
270void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271{
272 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
273 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
274 int32_t o = 0;
275 int32_t line = fLineNum.elementAti(expectedIdx);
276 if (expectedIdx > 0) {
277 // The line numbers are off by one because a premature break occurs somewhere
278 // within the previous item, rather than at the start of the current (expected) item.
279 // We want to report the offset of the unexpected break from the start of
280 // this previous item.
281 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282 }
283 if (actual < expected) {
46f4442e 284 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
73c04bcf 285 } else {
46f4442e 286 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
73c04bcf
A
287 }
288}
289
290
291void BITestData::clearResults() {
292 fActualBreakPositions.removeAllElements();
293 fActualTags.removeAllElements();
294}
295
296
73c04bcf
A
297//--------------------------------------------------------------------------------------
298//
299// RBBITest constructor and destructor
300//
301//--------------------------------------------------------------------------------------
302
303RBBITest::RBBITest() {
73c04bcf
A
304}
305
306
307RBBITest::~RBBITest() {
73c04bcf
A
308}
309
73c04bcf
A
310//-----------------------------------------------------------------------------------
311//
312// Test for status {tag} return value from break rules.
313// TODO: a more thorough test.
314//
315//-----------------------------------------------------------------------------------
316void RBBITest::TestStatusReturn() {
46f4442e 317 UnicodeString rulesString1("$Letters = [:L:];\n"
73c04bcf
A
318 "$Numbers = [:N:];\n"
319 "$Letters+{1};\n"
320 "$Numbers+{2};\n"
321 "Help\\ {4}/me\\!;\n"
322 "[^$Letters $Numbers];\n"
46f4442e 323 "!.*;\n", -1, US_INV);
73c04bcf
A
324 UnicodeString testString1 = "abc123..abc Help me Help me!";
325 // 01234567890123456789012345678
326 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
328
329 UErrorCode status=U_ZERO_ERROR;
330 UParseError parseError;
331
57a6839d 332 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
73c04bcf 333 if(U_FAILURE(status)) {
729e4ab9 334 dataerrln("FAIL : in construction - %s", u_errorName(status));
73c04bcf
A
335 } else {
336 int32_t pos;
337 int32_t i = 0;
338 bi->setText(testString1);
339 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340 if (pos != bounds1[i]) {
341 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
342 break;
343 }
344
345 int tag = bi->getRuleStatus();
346 if (tag != brkStatus[i]) {
347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348 break;
349 }
350 i++;
351 }
352 }
353 delete bi;
354}
355
356
357static void printStringBreaks(UnicodeString ustr, int expected[],
358 int expectedcount)
359{
360 UErrorCode status = U_ZERO_ERROR;
361 char name[100];
362 printf("code alpha extend alphanum type word sent line name\n");
363 int j;
364 for (j = 0; j < ustr.length(); j ++) {
365 if (expectedcount > 0) {
366 int k;
367 for (k = 0; k < expectedcount; k ++) {
368 if (j == expected[k]) {
369 printf("------------------------------------------------ %d\n",
370 j);
371 }
372 }
373 }
374 UChar32 c = ustr.char32At(j);
375 if (c > 0xffff) {
376 j ++;
377 }
378 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380 u_isUAlphabetic(c),
381 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382 u_isalnum(c),
383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384 u_charType(c),
385 U_SHORT_PROPERTY_NAME),
386 u_getPropertyValueName(UCHAR_WORD_BREAK,
387 u_getIntPropertyValue(c,
388 UCHAR_WORD_BREAK),
389 U_SHORT_PROPERTY_NAME),
390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391 u_getIntPropertyValue(c,
392 UCHAR_SENTENCE_BREAK),
393 U_SHORT_PROPERTY_NAME),
394 u_getPropertyValueName(UCHAR_LINE_BREAK,
395 u_getIntPropertyValue(c,
396 UCHAR_LINE_BREAK),
397 U_SHORT_PROPERTY_NAME),
398 name);
399 }
400}
401
73c04bcf
A
402
403void RBBITest::TestBug3818() {
404 UErrorCode status = U_ZERO_ERROR;
405
406 // Four Thai words...
407 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409 UnicodeString thaiStr(thaiWordData);
410
57a6839d 411 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
73c04bcf 412 if (U_FAILURE(status) || bi == NULL) {
729e4ab9 413 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
73c04bcf
A
414 return;
415 }
416 bi->setText(thaiStr);
417
418 int32_t startOfSecondWord = bi->following(1);
419 if (startOfSecondWord != 4) {
420 errln("Fail at file %s, line %d expected start of word at 4, got %d",
421 __FILE__, __LINE__, startOfSecondWord);
422 }
423 startOfSecondWord = bi->following(0);
424 if (startOfSecondWord != 4) {
425 errln("Fail at file %s, line %d expected start of word at 4, got %d",
426 __FILE__, __LINE__, startOfSecondWord);
427 }
428 delete bi;
429}
430
73c04bcf
A
431//----------------------------------------------------------------------------
432//
433// generalIteratorTest Given a break iterator and a set of test data,
434// Run the tests and report the results.
435//
436//----------------------------------------------------------------------------
437void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
438{
439
440 bi.setText(td.fDataToBreak);
441
442 testFirstAndNext(bi, td);
443
444 testLastAndPrevious(bi, td);
445
446 testFollowing(bi, td);
447 testPreceding(bi, td);
448 testIsBoundary(bi, td);
449 doMultipleSelectionTest(bi, td);
450}
451
452
453//
454// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
455// kind of loop.
456//
457void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
458{
459 UErrorCode status = U_ZERO_ERROR;
460 int32_t p;
461 int32_t lastP = -1;
462 int32_t tag;
463
464 logln("Test first and next");
465 bi.setText(td.fDataToBreak);
466 td.clearResults();
467
468 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
469 td.fActualBreakPositions.addElement(p, status); // Save result.
470 tag = bi.getRuleStatus();
471 td.fActualTags.addElement(tag, status);
472 if (p <= lastP) {
473 // If the iterator is not making forward progress, stop.
474 // No need to raise an error here, it'll be detected in the normal check of results.
475 break;
476 }
477 lastP = p;
478 }
479 td.checkResults("testFirstAndNext", this);
480}
481
482
483//
484// TestLastAndPrevious. Run the iterator backwards, starting with last().
485//
486void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
487{
488 UErrorCode status = U_ZERO_ERROR;
489 int32_t p;
490 int32_t lastP = 0x7ffffffe;
491 int32_t tag;
492
46f4442e 493 logln("Test last and previous");
73c04bcf
A
494 bi.setText(td.fDataToBreak);
495 td.clearResults();
496
497 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
498 // Save break position. Insert it at start of vector of results, shoving
499 // already-saved results further towards the end.
500 td.fActualBreakPositions.insertElementAt(p, 0, status);
501 // bi.previous(); // TODO: Why does this fix things up????
502 // bi.next();
503 tag = bi.getRuleStatus();
504 td.fActualTags.insertElementAt(tag, 0, status);
505 if (p >= lastP) {
506 // If the iterator is not making progress, stop.
507 // No need to raise an error here, it'll be detected in the normal check of results.
508 break;
509 }
510 lastP = p;
511 }
512 td.checkResults("testLastAndPrevious", this);
513}
514
515
516void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
517{
518 UErrorCode status = U_ZERO_ERROR;
519 int32_t p;
520 int32_t tag;
521 int32_t lastP = -2; // A value that will never be returned as a break position.
522 // cannot be -1; that is returned for DONE.
523 int i;
524
525 logln("testFollowing():");
526 bi.setText(td.fDataToBreak);
527 td.clearResults();
528
529 // Save the starting point, since we won't get that out of following.
530 p = bi.first();
531 td.fActualBreakPositions.addElement(p, status); // Save result.
532 tag = bi.getRuleStatus();
533 td.fActualTags.addElement(tag, status);
534
535 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
536 p = bi.following(i);
537 if (p != lastP) {
538 if (p == RuleBasedBreakIterator::DONE) {
539 break;
540 }
541 // We've reached a new break position. Save it.
542 td.fActualBreakPositions.addElement(p, status); // Save result.
543 tag = bi.getRuleStatus();
544 td.fActualTags.addElement(tag, status);
545 lastP = p;
546 }
547 }
548 // The loop normally exits by means of the break in the middle.
549 // Make sure that the index was at the correct position for the break iterator to have
550 // returned DONE.
551 if (i != td.fDataToBreak.length()) {
552 errln("testFollowing(): iterator returned DONE prematurely.");
553 }
554
555 // Full check of all results.
556 td.checkResults("testFollowing", this);
557}
558
559
560
561void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
562 UErrorCode status = U_ZERO_ERROR;
563 int32_t p;
564 int32_t tag;
565 int32_t lastP = 0x7ffffffe;
566 int i;
567
568 logln("testPreceding():");
569 bi.setText(td.fDataToBreak);
570 td.clearResults();
571
572 p = bi.last();
573 td.fActualBreakPositions.addElement(p, status);
574 tag = bi.getRuleStatus();
575 td.fActualTags.addElement(tag, status);
576
577 for (i = td.fDataToBreak.length(); i>=-1; i--) {
578 p = bi.preceding(i);
579 if (p != lastP) {
580 if (p == RuleBasedBreakIterator::DONE) {
581 break;
582 }
583 // We've reached a new break position. Save it.
584 td.fActualBreakPositions.insertElementAt(p, 0, status);
585 lastP = p;
586 tag = bi.getRuleStatus();
587 td.fActualTags.insertElementAt(tag, 0, status);
588 }
589 }
590 // The loop normally exits by means of the break in the middle.
591 // Make sure that the index was at the correct position for the break iterator to have
592 // returned DONE.
593 if (i != 0) {
594 errln("testPreceding(): iterator returned DONE prematurely.");
595 }
596
597 // Full check of all results.
598 td.checkResults("testPreceding", this);
599}
600
601
602
603void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
604 UErrorCode status = U_ZERO_ERROR;
605 int i;
606 int32_t tag;
607
608 logln("testIsBoundary():");
609 bi.setText(td.fDataToBreak);
610 td.clearResults();
611
612 for (i = 0; i <= td.fDataToBreak.length(); i++) {
613 if (bi.isBoundary(i)) {
614 td.fActualBreakPositions.addElement(i, status); // Save result.
615 tag = bi.getRuleStatus();
616 td.fActualTags.addElement(tag, status);
617 }
618 }
619 td.checkResults("testIsBoundary: ", this);
620}
621
622
623
624void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
625{
626 iterator.setText(td.fDataToBreak);
627
628 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
629 int32_t offset = iterator.first();
630 int32_t testOffset;
631 int32_t count = 0;
632
633 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
634
635 if (*testIterator != iterator)
636 errln("clone() or operator!= failed: two clones compared unequal");
637
638 do {
639 testOffset = testIterator->first();
640 testOffset = testIterator->next(count);
641 if (offset != testOffset)
642 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
643
644 if (offset != RuleBasedBreakIterator::DONE) {
645 count++;
646 offset = iterator.next();
647
648 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
649 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
650 if (count > 10000 || offset == -1) {
651 errln("operator== failed too many times. Stopping test.");
652 if (offset == -1) {
653 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
654 }
655 return;
656 }
657 }
658 }
659 } while (offset != RuleBasedBreakIterator::DONE);
660
661 // now do it backwards...
662 offset = iterator.last();
663 count = 0;
664
665 do {
666 testOffset = testIterator->last();
667 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
668 if (offset != testOffset)
669 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
670
671 if (offset != RuleBasedBreakIterator::DONE) {
672 count--;
673 offset = iterator.previous();
674 }
675 } while (offset != RuleBasedBreakIterator::DONE);
676
677 delete testIterator;
678}
679
680
681//---------------------------------------------
682//
683// other tests
684//
685//---------------------------------------------
686void RBBITest::TestEmptyString()
687{
688 UnicodeString text = "";
689 UErrorCode status = U_ZERO_ERROR;
690
691 BITestData x(status);
692 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
693 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
694 if (U_FAILURE(status))
695 {
729e4ab9 696 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
73c04bcf
A
697 return;
698 }
699 generalIteratorTest(*bi, x);
700 delete bi;
701}
702
703void RBBITest::TestGetAvailableLocales()
704{
705 int32_t locCount = 0;
706 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
707
708 if (locCount == 0)
729e4ab9 709 dataerrln("getAvailableLocales() returned an empty list!");
73c04bcf
A
710 // Just make sure that it's returning good memory.
711 int32_t i;
712 for (i = 0; i < locCount; ++i) {
713 logln(locList[i].getName());
714 }
715}
716
717//Testing the BreakIterator::getDisplayName() function
718void RBBITest::TestGetDisplayName()
719{
720 UnicodeString result;
721
722 BreakIterator::getDisplayName(Locale::getUS(), result);
723 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
729e4ab9 724 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
73c04bcf
A
725 + result);
726
727 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
728 if (result != "French (France)")
729e4ab9 729 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
73c04bcf
A
730 + result);
731}
732/**
733 * Test End Behaviour
734 * @bug 4068137
735 */
736void RBBITest::TestEndBehaviour()
737{
738 UErrorCode status = U_ZERO_ERROR;
739 UnicodeString testString("boo.");
740 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
741 if (U_FAILURE(status))
742 {
729e4ab9 743 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
73c04bcf
A
744 return;
745 }
746 wb->setText(testString);
747
748 if (wb->first() != 0)
749 errln("Didn't get break at beginning of string.");
750 if (wb->next() != 3)
751 errln("Didn't get break before period in \"boo.\"");
752 if (wb->current() != 4 && wb->next() != 4)
753 errln("Didn't get break at end of string.");
754 delete wb;
755}
756/*
757 * @bug 4153072
758 */
759void RBBITest::TestBug4153072() {
760 UErrorCode status = U_ZERO_ERROR;
761 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
762 if (U_FAILURE(status))
763 {
729e4ab9 764 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
73c04bcf
A
765 return;
766 }
767 UnicodeString str("...Hello, World!...");
768 int32_t begin = 3;
769 int32_t end = str.length() - 3;
770 UBool onBoundary;
771
772 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
773 iter->adoptText(textIterator);
774 int index;
775 // Note: with the switch to UText, there is no way to restrict the
776 // iteration range to begin at an index other than zero.
777 // String character iterators created with a non-zero bound are
778 // treated by RBBI as being empty.
779 for (index = -1; index < begin + 1; ++index) {
780 onBoundary = iter->isBoundary(index);
781 if (index == 0? !onBoundary : onBoundary) {
782 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
783 " and begin index = " + begin);
784 }
785 }
786 delete iter;
787}
788
789
46f4442e
A
790//
791// Test for problem reported by Ashok Matoria on 9 July 2007
792// One.<kSoftHyphen><kSpace>Two.
793//
794// Sentence break at start (0) and then on calling next() it breaks at
795// 'T' of "Two". Now, at this point if I do next() and
796// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
797//
798void RBBITest::TestBug5775() {
799 UErrorCode status = U_ZERO_ERROR;
800 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
801 TEST_ASSERT_SUCCESS(status);
729e4ab9
A
802 if (U_FAILURE(status)) {
803 return;
804 }
805// Check for status first for better handling of no data errors.
46f4442e 806 TEST_ASSERT(bi != NULL);
729e4ab9 807 if (bi == NULL) {
46f4442e
A
808 return;
809 }
729e4ab9 810
46f4442e
A
811 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
812 // 01234 56789
813 s = s.unescape();
814 bi->setText(s);
815 int pos = bi->next();
816 TEST_ASSERT(pos == 6);
817 pos = bi->next();
818 TEST_ASSERT(pos == 10);
819 pos = bi->previous();
820 TEST_ASSERT(pos == 6);
821 delete bi;
822}
823
824
825
73c04bcf
A
826//------------------------------------------------------------------------------
827//
828// RBBITest::Extended Run RBBI Tests from an external test data file
829//
830//------------------------------------------------------------------------------
831
832struct TestParams {
833 BreakIterator *bi;
834 UnicodeString dataToBreak;
835 UVector32 *expectedBreaks;
836 UVector32 *srcLine;
837 UVector32 *srcCol;
838};
839
840void RBBITest::executeTest(TestParams *t) {
841 int32_t bp;
842 int32_t prevBP;
843 int32_t i;
844
845 if (t->bi == NULL) {
846 return;
847 }
848
849 t->bi->setText(t->dataToBreak);
850 //
851 // Run the iterator forward
852 //
853 prevBP = -1;
854 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
855 if (prevBP == bp) {
856 // Fail for lack of forward progress.
857 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
858 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
859 break;
860 }
861
862 // Check that there were we didn't miss an expected break between the last one
863 // and this one.
864 for (i=prevBP+1; i<bp; i++) {
865 if (t->expectedBreaks->elementAti(i) != 0) {
866 int expected[] = {0, i};
867 printStringBreaks(t->dataToBreak, expected, 2);
868 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
869 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
870 }
871 }
872
873 // Check that the break we did find was expected
874 if (t->expectedBreaks->elementAti(bp) == 0) {
875 int expected[] = {0, bp};
876 printStringBreaks(t->dataToBreak, expected, 2);
877 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
878 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
879 } else {
880 // The break was expected.
881 // Check that the {nnn} tag value is correct.
882 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
883 if (expectedTagVal == -1) {
884 expectedTagVal = 0;
885 }
886 int32_t line = t->srcLine->elementAti(bp);
887 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
888 if (rs != expectedTagVal) {
889 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
890 " Actual, Expected status = %4d, %4d",
891 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
892 }
893 }
894
895
896 prevBP = bp;
897 }
898
899 // Verify that there were no missed expected breaks after the last one found
900 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
901 if (t->expectedBreaks->elementAti(i) != 0) {
902 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
903 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
904 }
905 }
906
907 //
908 // Run the iterator backwards, verify that the same breaks are found.
909 //
910 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
911 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
912 if (prevBP == bp) {
913 // Fail for lack of progress.
914 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
915 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
916 break;
917 }
918
919 // Check that there were we didn't miss an expected break between the last one
920 // and this one. (UVector returns zeros for index out of bounds.)
921 for (i=prevBP-1; i>bp; i--) {
922 if (t->expectedBreaks->elementAti(i) != 0) {
923 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
924 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
925 }
926 }
927
928 // Check that the break we did find was expected
929 if (t->expectedBreaks->elementAti(bp) == 0) {
930 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
931 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
932 } else {
933 // The break was expected.
934 // Check that the {nnn} tag value is correct.
935 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
936 if (expectedTagVal == -1) {
937 expectedTagVal = 0;
938 }
939 int line = t->srcLine->elementAti(bp);
940 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
941 if (rs != expectedTagVal) {
942 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
943 " Actual, Expected status = %4d, %4d",
944 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
945 }
946 }
947
948 prevBP = bp;
949 }
950
951 // Verify that there were no missed breaks prior to the last one found
952 for (i=prevBP-1; i>=0; i--) {
953 if (t->expectedBreaks->elementAti(i) != 0) {
954 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
955 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
956 }
957 }
51004dcb
A
958
959 // Check isBoundary()
960 for (i=0; i<t->expectedBreaks->size(); i++) {
961 UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
962 UBool boundaryFound = t->bi->isBoundary(i);
963 if (boundaryExpected != boundaryFound) {
964 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
965 " Expected, Actual= %s, %s",
966 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
967 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
968 }
969 }
970
971 // Check following()
972 for (i=0; i<t->expectedBreaks->size(); i++) {
973 int32_t actualBreak = t->bi->following(i);
974 int32_t expectedBreak = BreakIterator::DONE;
975 for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
976 if (t->expectedBreaks->elementAti(j) != 0) {
977 expectedBreak = j;
978 break;
979 }
980 }
981 if (expectedBreak != actualBreak) {
982 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
983 " Expected, Actual= %d, %d",
984 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
985 }
986 }
987
988 // Check preceding()
989 for (i=t->expectedBreaks->size(); i>=0; i--) {
990 int32_t actualBreak = t->bi->preceding(i);
991 int32_t expectedBreak = BreakIterator::DONE;
992
993 for (int32_t j=i-1; j >= 0; j--) {
994 if (t->expectedBreaks->elementAti(j) != 0) {
995 expectedBreak = j;
996 break;
997 }
998 }
999 if (expectedBreak != actualBreak) {
1000 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1001 " Expected, Actual= %d, %d",
1002 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1003 }
1004 }
73c04bcf
A
1005}
1006
1007
1008void RBBITest::TestExtended() {
1009#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1010 UErrorCode status = U_ZERO_ERROR;
1011 Locale locale("");
1012
1013 UnicodeString rules;
1014 TestParams tp;
1015 tp.bi = NULL;
1016 tp.expectedBreaks = new UVector32(status);
1017 tp.srcLine = new UVector32(status);
1018 tp.srcCol = new UVector32(status);
1019
08b89b0a 1020 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
729e4ab9
A
1021 if (U_FAILURE(status)) {
1022 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1023 }
73c04bcf
A
1024
1025
1026 //
1027 // Open and read the test data file.
1028 //
1029 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1030 char testFileName[1000];
1031 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1032 errln("Can't open test data. Path too long.");
1033 return;
1034 }
1035 strcpy(testFileName, testDataDirectory);
1036 strcat(testFileName, "rbbitst.txt");
1037
1038 int len;
46f4442e 1039 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
73c04bcf
A
1040 if (U_FAILURE(status)) {
1041 return; /* something went wrong, error already output */
1042 }
1043
1044
1045
46f4442e 1046
73c04bcf
A
1047 //
1048 // Put the test data into a UnicodeString
1049 //
1050 UnicodeString testString(FALSE, testFile, len);
1051
1052 enum EParseState{
1053 PARSE_COMMENT,
1054 PARSE_TAG,
1055 PARSE_DATA,
1056 PARSE_NUM
1057 }
1058 parseState = PARSE_TAG;
1059
1060 EParseState savedState = PARSE_TAG;
1061
1062 static const UChar CH_LF = 0x0a;
1063 static const UChar CH_CR = 0x0d;
1064 static const UChar CH_HASH = 0x23;
1065 /*static const UChar CH_PERIOD = 0x2e;*/
1066 static const UChar CH_LT = 0x3c;
1067 static const UChar CH_GT = 0x3e;
1068 static const UChar CH_BACKSLASH = 0x5c;
1069 static const UChar CH_BULLET = 0x2022;
1070
1071 int32_t lineNum = 1;
1072 int32_t colStart = 0;
1073 int32_t column = 0;
1074 int32_t charIdx = 0;
1075
1076 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1077
1078 for (charIdx = 0; charIdx < len; ) {
1079 status = U_ZERO_ERROR;
1080 UChar c = testString.charAt(charIdx);
1081 charIdx++;
1082 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1083 // treat CRLF as a unit
1084 c = CH_LF;
1085 charIdx++;
1086 }
1087 if (c == CH_LF || c == CH_CR) {
1088 lineNum++;
1089 colStart = charIdx;
1090 }
1091 column = charIdx - colStart + 1;
1092
1093 switch (parseState) {
1094 case PARSE_COMMENT:
1095 if (c == 0x0a || c == 0x0d) {
1096 parseState = savedState;
1097 }
1098 break;
1099
1100 case PARSE_TAG:
1101 {
1102 if (c == CH_HASH) {
1103 parseState = PARSE_COMMENT;
1104 savedState = PARSE_TAG;
1105 break;
1106 }
1107 if (u_isUWhiteSpace(c)) {
1108 break;
1109 }
1110 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1111 delete tp.bi;
1112 tp.bi = BreakIterator::createWordInstance(locale, status);
1113 charIdx += 5;
1114 break;
1115 }
1116 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1117 delete tp.bi;
1118 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1119 charIdx += 5;
1120 break;
1121 }
1122 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1123 delete tp.bi;
1124 tp.bi = BreakIterator::createLineInstance(locale, status);
1125 charIdx += 5;
1126 break;
1127 }
1128 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1129 delete tp.bi;
1130 tp.bi = NULL;
46f4442e 1131 tp.bi = BreakIterator::createSentenceInstance(locale, status);
73c04bcf
A
1132 charIdx += 5;
1133 break;
1134 }
1135 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1136 delete tp.bi;
1137 tp.bi = BreakIterator::createTitleInstance(locale, status);
1138 charIdx += 6;
1139 break;
1140 }
46f4442e 1141
73c04bcf
A
1142 // <locale loc_name>
1143 localeMatcher.reset(testString);
1144 if (localeMatcher.lookingAt(charIdx-1, status)) {
1145 UnicodeString localeName = localeMatcher.group(1, status);
1146 char localeName8[100];
1147 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1148 locale = Locale::createFromName(localeName8);
51004dcb 1149 charIdx += localeMatcher.group(0, status).length() - 1;
73c04bcf
A
1150 TEST_ASSERT_SUCCESS(status);
1151 break;
1152 }
1153 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1154 parseState = PARSE_DATA;
1155 charIdx += 5;
1156 tp.dataToBreak = "";
1157 tp.expectedBreaks->removeAllElements();
1158 tp.srcCol ->removeAllElements();
1159 tp.srcLine->removeAllElements();
1160 break;
1161 }
1162
1163 errln("line %d: Tag expected in test file.", lineNum);
73c04bcf
A
1164 parseState = PARSE_COMMENT;
1165 savedState = PARSE_DATA;
46f4442e 1166 goto end_test; // Stop the test.
73c04bcf
A
1167 }
1168 break;
1169
1170 case PARSE_DATA:
1171 if (c == CH_BULLET) {
1172 int32_t breakIdx = tp.dataToBreak.length();
1173 tp.expectedBreaks->setSize(breakIdx+1);
1174 tp.expectedBreaks->setElementAt(-1, breakIdx);
1175 tp.srcLine->setSize(breakIdx+1);
1176 tp.srcLine->setElementAt(lineNum, breakIdx);
1177 tp.srcCol ->setSize(breakIdx+1);
1178 tp.srcCol ->setElementAt(column, breakIdx);
1179 break;
1180 }
1181
1182 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1183 // Add final entry to mappings from break location to source file position.
1184 // Need one extra because last break position returned is after the
1185 // last char in the data, not at the last char.
1186 tp.srcLine->addElement(lineNum, status);
1187 tp.srcCol ->addElement(column, status);
1188
1189 parseState = PARSE_TAG;
1190 charIdx += 6;
1191
1192 // RUN THE TEST!
1193 executeTest(&tp);
1194 break;
1195 }
1196
46f4442e 1197 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
73c04bcf
A
1198 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1199 // Get the code point from the name and insert it into the test data.
1200 // (Damn, no API takes names in Unicode !!!
1201 // we've got to take it back to char *)
1202 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1203 int32_t nameLength = nameEndIdx - (charIdx+2);
1204 char charNameBuf[200];
1205 UChar32 theChar = -1;
1206 if (nameEndIdx != -1) {
1207 UErrorCode status = U_ZERO_ERROR;
1208 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1209 charNameBuf[sizeof(charNameBuf)-1] = 0;
1210 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1211 if (U_FAILURE(status)) {
1212 theChar = -1;
1213 }
1214 }
1215 if (theChar == -1) {
1216 errln("Error in named character in test file at line %d, col %d",
1217 lineNum, column);
1218 } else {
1219 // Named code point was recognized. Insert it
1220 // into the test data.
1221 tp.dataToBreak.append(theChar);
1222 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1223 tp.srcLine->addElement(lineNum, status);
1224 tp.srcCol ->addElement(column, status);
1225 }
1226 }
1227 if (nameEndIdx > charIdx) {
1228 charIdx = nameEndIdx+1;
1229
1230 }
1231 break;
1232 }
1233
1234
1235
1236
1237 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1238 charIdx++;
1239 int32_t breakIdx = tp.dataToBreak.length();
1240 tp.expectedBreaks->setSize(breakIdx+1);
1241 tp.expectedBreaks->setElementAt(-1, breakIdx);
1242 tp.srcLine->setSize(breakIdx+1);
1243 tp.srcLine->setElementAt(lineNum, breakIdx);
1244 tp.srcCol ->setSize(breakIdx+1);
1245 tp.srcCol ->setElementAt(column, breakIdx);
1246 break;
1247 }
1248
1249 if (c == CH_LT) {
1250 tagValue = 0;
1251 parseState = PARSE_NUM;
1252 break;
1253 }
1254
1255 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1256 parseState = PARSE_COMMENT;
1257 savedState = PARSE_DATA;
1258 break;
1259 }
1260
1261 if (c == CH_BACKSLASH) {
1262 // Check for \ at end of line, a line continuation.
1263 // Advance over (discard) the newline
1264 UChar32 cp = testString.char32At(charIdx);
1265 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1266 // We have a CR LF
1267 // Need an extra increment of the input ptr to move over both of them
1268 charIdx++;
1269 }
1270 if (cp == CH_LF || cp == CH_CR) {
1271 lineNum++;
1272 colStart = charIdx;
1273 charIdx++;
1274 break;
1275 }
1276
1277 // Let unescape handle the back slash.
1278 cp = testString.unescapeAt(charIdx);
1279 if (cp != -1) {
1280 // Escape sequence was recognized. Insert the char
1281 // into the test data.
1282 tp.dataToBreak.append(cp);
1283 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1284 tp.srcLine->addElement(lineNum, status);
1285 tp.srcCol ->addElement(column, status);
1286 }
1287 break;
1288 }
1289
1290
1291 // Not a recognized backslash escape sequence.
1292 // Take the next char as a literal.
1293 // TODO: Should this be an error?
1294 c = testString.charAt(charIdx);
1295 charIdx = testString.moveIndex32(charIdx, 1);
1296 }
1297
1298 // Normal, non-escaped data char.
1299 tp.dataToBreak.append(c);
1300
1301 // Save the mapping from offset in the data to line/column numbers in
1302 // the original input file. Will be used for better error messages only.
1303 // If there's an expected break before this char, the slot in the mapping
1304 // vector will already be set for this char; don't overwrite it.
1305 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1306 tp.srcLine->addElement(lineNum, status);
1307 tp.srcCol ->addElement(column, status);
1308 }
1309 break;
1310
1311
1312 case PARSE_NUM:
1313 // We are parsing an expected numeric tag value, like <1234>,
1314 // within a chunk of data.
1315 if (u_isUWhiteSpace(c)) {
1316 break;
1317 }
1318
1319 if (c == CH_GT) {
1320 // Finished the number. Add the info to the expected break data,
1321 // and switch parse state back to doing plain data.
1322 parseState = PARSE_DATA;
1323 if (tagValue == 0) {
1324 tagValue = -1;
1325 }
1326 int32_t breakIdx = tp.dataToBreak.length();
1327 tp.expectedBreaks->setSize(breakIdx+1);
1328 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1329 tp.srcLine->setSize(breakIdx+1);
1330 tp.srcLine->setElementAt(lineNum, breakIdx);
1331 tp.srcCol ->setSize(breakIdx+1);
1332 tp.srcCol ->setElementAt(column, breakIdx);
1333 break;
1334 }
1335
1336 if (u_isdigit(c)) {
1337 tagValue = tagValue*10 + u_charDigitValue(c);
1338 break;
1339 }
1340
1341 errln("Syntax Error in test file at line %d, col %d",
1342 lineNum, column);
73c04bcf 1343 parseState = PARSE_COMMENT;
46f4442e 1344 goto end_test; // Stop the test
73c04bcf
A
1345 break;
1346 }
1347
1348
1349 if (U_FAILURE(status)) {
4388f060 1350 dataerrln("ICU Error %s while parsing test file at line %d.",
73c04bcf 1351 u_errorName(status), lineNum);
73c04bcf 1352 status = U_ZERO_ERROR;
46f4442e 1353 goto end_test; // Stop the test
73c04bcf
A
1354 }
1355
1356 }
1357
1358end_test:
1359 delete tp.bi;
1360 delete tp.expectedBreaks;
1361 delete tp.srcLine;
1362 delete tp.srcCol;
1363 delete [] testFile;
1364#endif
1365}
1366
729e4ab9
A
1367
1368//-------------------------------------------------------------------------------
1369//
1370// TestDictRules create a break iterator from source rules that includes a
1371// dictionary range. Regression for bug #7130. Source rules
1372// do not declare a break iterator type (word, line, sentence, etc.
1373// but the dictionary code, without a type, would loop.
1374//
1375//-------------------------------------------------------------------------------
1376void RBBITest::TestDictRules() {
1377 const char *rules = "$dictionary = [a-z]; \n"
1378 "!!forward; \n"
1379 "$dictionary $dictionary; \n"
1380 "!!reverse; \n"
1381 "$dictionary $dictionary; \n";
1382 const char *text = "aa";
1383 UErrorCode status = U_ZERO_ERROR;
1384 UParseError parseError;
1385
1386 RuleBasedBreakIterator bi(rules, parseError, status);
1387 if (U_SUCCESS(status)) {
1388 UnicodeString utext = text;
1389 bi.setText(utext);
1390 int32_t position;
1391 int32_t loops;
1392 for (loops = 0; loops<10; loops++) {
1393 position = bi.next();
1394 if (position == RuleBasedBreakIterator::DONE) {
1395 break;
1396 }
1397 }
1398 TEST_ASSERT(loops == 1);
1399 } else {
1400 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1401 }
1402}
1403
1404
73c04bcf
A
1405
1406//-------------------------------------------------------------------------------
1407//
1408// ReadAndConvertFile Read a text data file, convert it to UChars, and
1409// return the datain one big UChar * buffer, which the caller must delete.
1410//
46f4442e
A
1411// parameters:
1412// fileName: the name of the file, with no directory part. The test data directory
1413// is assumed.
1414// ulen an out parameter, receives the actual length (in UChars) of the file data.
1415// encoding The file encoding. If the file contains a BOM, that will override the encoding
1416// specified here. The BOM, if it exists, will be stripped from the returned data.
1417// Pass NULL for the system default encoding.
1418// status
1419// returns:
1420// The file data, converted to UChar.
1421// The caller must delete this when done with
1422// delete [] theBuffer;
1423//
73c04bcf
A
1424// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1425// Move this function to some common place.
1426//
1427//--------------------------------------------------------------------------------
46f4442e 1428UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
73c04bcf
A
1429 UChar *retPtr = NULL;
1430 char *fileBuf = NULL;
1431 UConverter* conv = NULL;
1432 FILE *f = NULL;
1433
1434 ulen = 0;
1435 if (U_FAILURE(status)) {
1436 return retPtr;
1437 }
1438
1439 //
1440 // Open the file.
1441 //
1442 f = fopen(fileName, "rb");
1443 if (f == 0) {
729e4ab9 1444 dataerrln("Error opening test data file %s\n", fileName);
73c04bcf
A
1445 status = U_FILE_ACCESS_ERROR;
1446 return NULL;
1447 }
1448 //
1449 // Read it in
1450 //
1451 int fileSize;
1452 int amt_read;
1453
1454 fseek( f, 0, SEEK_END);
1455 fileSize = ftell(f);
1456 fileBuf = new char[fileSize];
1457 fseek(f, 0, SEEK_SET);
1458 amt_read = fread(fileBuf, 1, fileSize, f);
1459 if (amt_read != fileSize || fileSize <= 0) {
1460 errln("Error reading test data file.");
1461 goto cleanUpAndReturn;
1462 }
1463
1464 //
1465 // Look for a Unicode Signature (BOM) on the data just read
1466 //
1467 int32_t signatureLength;
1468 const char * fileBufC;
46f4442e 1469 const char* bomEncoding;
73c04bcf
A
1470
1471 fileBufC = fileBuf;
46f4442e 1472 bomEncoding = ucnv_detectUnicodeSignature(
73c04bcf 1473 fileBuf, fileSize, &signatureLength, &status);
46f4442e 1474 if(bomEncoding!=NULL ){
73c04bcf
A
1475 fileBufC += signatureLength;
1476 fileSize -= signatureLength;
46f4442e 1477 encoding = bomEncoding;
73c04bcf
A
1478 }
1479
1480 //
1481 // Open a converter to take the rule file to UTF-16
1482 //
1483 conv = ucnv_open(encoding, &status);
1484 if (U_FAILURE(status)) {
1485 goto cleanUpAndReturn;
1486 }
1487
1488 //
1489 // Convert the rules to UChar.
1490 // Preflight first to determine required buffer size.
1491 //
1492 ulen = ucnv_toUChars(conv,
1493 NULL, // dest,
1494 0, // destCapacity,
1495 fileBufC,
1496 fileSize,
1497 &status);
1498 if (status == U_BUFFER_OVERFLOW_ERROR) {
1499 // Buffer Overflow is expected from the preflight operation.
1500 status = U_ZERO_ERROR;
1501
1502 retPtr = new UChar[ulen+1];
1503 ucnv_toUChars(conv,
1504 retPtr, // dest,
1505 ulen+1,
1506 fileBufC,
1507 fileSize,
1508 &status);
1509 }
1510
1511cleanUpAndReturn:
1512 fclose(f);
1513 delete []fileBuf;
1514 ucnv_close(conv);
1515 if (U_FAILURE(status)) {
1516 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 1517 delete []retPtr;
73c04bcf
A
1518 retPtr = 0;
1519 ulen = 0;
1520 };
1521 return retPtr;
1522}
1523
1524
73c04bcf 1525
46f4442e 1526//--------------------------------------------------------------------------------------------
73c04bcf 1527//
46f4442e 1528// Run tests from each of the boundary test data files distributed by the Unicode Consortium
73c04bcf 1529//
46f4442e
A
1530//-------------------------------------------------------------------------------------------
1531void RBBITest::TestUnicodeFiles() {
1532 RuleBasedBreakIterator *bi;
1533 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1534
729e4ab9 1535 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
46f4442e
A
1536 TEST_ASSERT_SUCCESS(status);
1537 if (U_SUCCESS(status)) {
1538 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1539 }
1540 delete bi;
73c04bcf 1541
729e4ab9 1542 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
46f4442e
A
1543 TEST_ASSERT_SUCCESS(status);
1544 if (U_SUCCESS(status)) {
1545 runUnicodeTestData("WordBreakTest.txt", bi);
1546 }
1547 delete bi;
73c04bcf 1548
729e4ab9 1549 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
46f4442e
A
1550 TEST_ASSERT_SUCCESS(status);
1551 if (U_SUCCESS(status)) {
1552 runUnicodeTestData("SentenceBreakTest.txt", bi);
1553 }
1554 delete bi;
73c04bcf 1555
729e4ab9 1556 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
46f4442e
A
1557 TEST_ASSERT_SUCCESS(status);
1558 if (U_SUCCESS(status)) {
1559 runUnicodeTestData("LineBreakTest.txt", bi);
73c04bcf 1560 }
46f4442e 1561 delete bi;
73c04bcf
A
1562}
1563
1564
46f4442e
A
1565//--------------------------------------------------------------------------------------------
1566//
1567// Run tests from one of the boundary test data files distributed by the Unicode Consortium
1568//
1569//-------------------------------------------------------------------------------------------
1570void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1571#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4388f060 1572 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
57a6839d 1573 UBool isTicket7270Fixed = !logKnownIssue("7270");
4388f060 1574 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
46f4442e 1575 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1576
46f4442e
A
1577 //
1578 // Open and read the test data file, put it into a UnicodeString.
1579 //
1580 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1581 char testFileName[1000];
1582 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
729e4ab9 1583 dataerrln("Can't open test data. Path too long.");
73c04bcf
A
1584 return;
1585 }
46f4442e
A
1586 strcpy(testFileName, testDataDirectory);
1587 strcat(testFileName, fileName);
1588
1589 logln("Opening data file %s\n", fileName);
73c04bcf 1590
46f4442e
A
1591 int len;
1592 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1593 if (status != U_FILE_ACCESS_ERROR) {
1594 TEST_ASSERT_SUCCESS(status);
1595 TEST_ASSERT(testFile != NULL);
1596 }
1597 if (U_FAILURE(status) || testFile == NULL) {
1598 return; /* something went wrong, error already output */
1599 }
1600 UnicodeString testFileAsString(TRUE, testFile, len);
73c04bcf 1601
46f4442e
A
1602 //
1603 // Parse the test data file using a regular expression.
1604 // Each kind of token is recognized in its own capture group; what type of item was scanned
1605 // is identified by which group had a match.
1606 //
1607 // Caputure Group # 1 2 3 4 5
1608 // Parses this item: divide x hex digits comment \n unrecognized \n
1609 //
1610 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1611 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1612 UnicodeString testString;
1613 UVector32 breakPositions(status);
1614 int lineNumber = 1;
1615 TEST_ASSERT_SUCCESS(status);
1616 if (U_FAILURE(status)) {
73c04bcf
A
1617 return;
1618 }
1619
46f4442e
A
1620 //
1621 // Scan through each test case, building up the string to be broken in testString,
1622 // and the positions that should be boundaries in the breakPositions vector.
1623 //
729e4ab9 1624 int spin = 0;
46f4442e 1625 while (tokenMatcher.find()) {
729e4ab9
A
1626 if(tokenMatcher.hitEnd()) {
1627 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1628 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1629 and caused an infinite loop here on EBCDIC systems!
1630 */
1631 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1632 // return;
1633 }
46f4442e
A
1634 if (tokenMatcher.start(1, status) >= 0) {
1635 // Scanned a divide sign, indicating a break position in the test data.
1636 if (testString.length()>0) {
1637 breakPositions.addElement(testString.length(), status);
73c04bcf 1638 }
46f4442e
A
1639 }
1640 else if (tokenMatcher.start(2, status) >= 0) {
1641 // Scanned an 'x', meaning no break at this position in the test data
1642 // Nothing to be done here.
1643 }
1644 else if (tokenMatcher.start(3, status) >= 0) {
1645 // Scanned Hex digits. Convert them to binary, append to the character data string.
1646 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1647 int length = hexNumber.length();
1648 if (length<=8) {
1649 char buf[10];
1650 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1651 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1652 if (c<=0x10ffff) {
1653 testString.append(c);
1654 } else {
1655 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1656 fileName, lineNumber);
1657 }
1658 } else {
1659 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1660 fileName, lineNumber);
1661 }
1662 }
1663 else if (tokenMatcher.start(4, status) >= 0) {
1664 // Scanned to end of a line, possibly skipping over a comment in the process.
1665 // If the line from the file contained test data, run the test now.
1666 //
1667 if (testString.length() > 0) {
51004dcb
A
1668// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1669// Rule 8
1670// ZW SP* <break>
1671// is not yet implemented.
1672if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1673 5202 == lineNumber ||
1674 5214 == lineNumber ||
1675 5246 == lineNumber ||
1676 5298 == lineNumber ||
1677 5302 == lineNumber ))) {
46f4442e 1678 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
729e4ab9 1679}
73c04bcf
A
1680 }
1681
46f4442e
A
1682 // Clear out this test case.
1683 // The string and breakPositions vector will be refilled as the next
1684 // test case is parsed.
1685 testString.remove();
1686 breakPositions.removeAllElements();
1687 lineNumber++;
1688 } else {
1689 // Scanner catchall. Something unrecognized appeared on the line.
1690 char token[16];
1691 UnicodeString uToken = tokenMatcher.group(0, status);
1692 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1693 token[sizeof(token)-1] = 0;
1694 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1695
1696 // Clean up, in preparation for continuing with the next line.
1697 testString.remove();
1698 breakPositions.removeAllElements();
1699 lineNumber++;
1700 }
1701 TEST_ASSERT_SUCCESS(status);
1702 if (U_FAILURE(status)) {
73c04bcf
A
1703 break;
1704 }
46f4442e 1705 }
73c04bcf 1706
46f4442e
A
1707 delete [] testFile;
1708 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1709}
73c04bcf 1710
46f4442e
A
1711//--------------------------------------------------------------------------------------------
1712//
1713// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1714// test data files. Do only a simple, forward-only check -
1715// this test is mostly to check that ICU and the Unicode
1716// data agree with each other.
1717//
1718//--------------------------------------------------------------------------------------------
1719void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1720 const UnicodeString &testString, // Text data to be broken
1721 UVector32 *breakPositions, // Positions where breaks should be found.
1722 RuleBasedBreakIterator *bi) {
1723 int32_t pos; // Break Position in the test string
1724 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1725 int32_t expectedPos; // Expected break position (index into test string)
1726
1727 bi->setText(testString);
1728 pos = bi->first();
1729 pos = bi->next();
1730
1731 while (pos != BreakIterator::DONE) {
1732 if (expectedI >= breakPositions->size()) {
1733 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1734 testFileName, lineNumber, pos);
1735 break;
73c04bcf 1736 }
46f4442e
A
1737 expectedPos = breakPositions->elementAti(expectedI);
1738 if (pos < expectedPos) {
1739 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1740 testFileName, lineNumber, pos);
1741 break;
1742 }
1743 if (pos > expectedPos) {
1744 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1745 testFileName, lineNumber, expectedPos);
73c04bcf
A
1746 break;
1747 }
46f4442e
A
1748 pos = bi->next();
1749 expectedI++;
1750 }
73c04bcf 1751
46f4442e
A
1752 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1753 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1754 testFileName, lineNumber, breakPositions->elementAti(expectedI));
73c04bcf 1755 }
46f4442e 1756}
73c04bcf 1757
73c04bcf 1758
73c04bcf
A
1759
1760#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
1761//---------------------------------------------------------------------------------------
1762//
1763// classs RBBIMonkeyKind
1764//
1765// Monkey Test for Break Iteration
1766// Abstract interface class. Concrete derived classes independently
1767// implement the break rules for different iterator types.
1768//
1769// The Monkey Test itself uses doesn't know which type of break iterator it is
1770// testing, but works purely in terms of the interface defined here.
1771//
1772//---------------------------------------------------------------------------------------
1773class RBBIMonkeyKind {
1774public:
1775 // Return a UVector of UnicodeSets, representing the character classes used
1776 // for this type of iterator.
1777 virtual UVector *charClasses() = 0;
1778
1779 // Set the test text on which subsequent calls to next() will operate
1780 virtual void setText(const UnicodeString &s) = 0;
1781
1782 // Find the next break postion, starting from the prev break position, or from zero.
1783 // Return -1 after reaching end of string.
1784 virtual int32_t next(int32_t i) = 0;
1785
1786 virtual ~RBBIMonkeyKind();
1787 UErrorCode deferredStatus;
1788
1789
1790protected:
1791 RBBIMonkeyKind();
1792
1793private:
1794};
1795
1796RBBIMonkeyKind::RBBIMonkeyKind() {
1797 deferredStatus = U_ZERO_ERROR;
1798}
1799
1800RBBIMonkeyKind::~RBBIMonkeyKind() {
1801}
1802
1803
//----------------------------------------------------------------------------------------
//
//   Random Numbers.   A small linear congruential generator, similar to the
//                     standard library rand() and srand().
//
//                     The library functions are not used, so that
//                        1.  the same results are produced on all platforms, and
//                        2.  the current seed is accessible, which makes failures
//                            easier to reproduce.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0 .. 32767.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245u + 12345u;
    // Bits 16 through 30 of the new seed form the result.
    return (m_seed >> 16) & 0x7fffu;
}
1819
1820
1821//------------------------------------------------------------------------------------------
1822//
1823// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1824// of RBBIMonkeyKind.
1825//
1826//------------------------------------------------------------------------------------------
1827class RBBICharMonkey: public RBBIMonkeyKind {
1828public:
1829 RBBICharMonkey();
1830 virtual ~RBBICharMonkey();
1831 virtual UVector *charClasses();
1832 virtual void setText(const UnicodeString &s);
1833 virtual int32_t next(int32_t i);
1834private:
1835 UVector *fSets;
1836
1837 UnicodeSet *fCRLFSet;
1838 UnicodeSet *fControlSet;
1839 UnicodeSet *fExtendSet;
51004dcb 1840 UnicodeSet *fRegionalIndicatorSet;
46f4442e
A
1841 UnicodeSet *fPrependSet;
1842 UnicodeSet *fSpacingSet;
1843 UnicodeSet *fLSet;
1844 UnicodeSet *fVSet;
1845 UnicodeSet *fTSet;
1846 UnicodeSet *fLVSet;
1847 UnicodeSet *fLVTSet;
73c04bcf
A
1848 UnicodeSet *fHangulSet;
1849 UnicodeSet *fAnySet;
1850
73c04bcf
A
1851 const UnicodeString *fText;
1852};
1853
1854
1855RBBICharMonkey::RBBICharMonkey() {
1856 UErrorCode status = U_ZERO_ERROR;
1857
1858 fText = NULL;
73c04bcf 1859
46f4442e
A
1860 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1861 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1862 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
51004dcb 1863 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
46f4442e
A
1864 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1865 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1866 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1867 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1868 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1869 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1870 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1871 fHangulSet = new UnicodeSet();
1872 fHangulSet->addAll(*fLSet);
1873 fHangulSet->addAll(*fVSet);
1874 fHangulSet->addAll(*fTSet);
1875 fHangulSet->addAll(*fLVSet);
1876 fHangulSet->addAll(*fLVTSet);
4388f060
A
1877 fAnySet = new UnicodeSet(0, 0x10ffff);
1878
73c04bcf
A
1879 fSets = new UVector(status);
1880 fSets->addElement(fCRLFSet, status);
1881 fSets->addElement(fControlSet, status);
1882 fSets->addElement(fExtendSet, status);
51004dcb 1883 fSets->addElement(fRegionalIndicatorSet, status);
4388f060
A
1884 if (!fPrependSet->isEmpty()) {
1885 fSets->addElement(fPrependSet, status);
1886 }
46f4442e 1887 fSets->addElement(fSpacingSet, status);
73c04bcf
A
1888 fSets->addElement(fHangulSet, status);
1889 fSets->addElement(fAnySet, status);
1890 if (U_FAILURE(status)) {
1891 deferredStatus = status;
1892 }
1893}
1894
1895
1896void RBBICharMonkey::setText(const UnicodeString &s) {
1897 fText = &s;
73c04bcf
A
1898}
1899
1900
73c04bcf 1901
46f4442e
A
1902int32_t RBBICharMonkey::next(int32_t prevPos) {
1903 int p0, p1, p2, p3; // Indices of the significant code points around the
1904 // break position being tested. The candidate break
1905 // location is before p2.
1906
1907 int breakPos = -1;
1908
1909 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1910
1911 if (U_FAILURE(deferredStatus)) {
1912 return -1;
73c04bcf 1913 }
46f4442e
A
1914
1915 // Previous break at end of string. return DONE.
1916 if (prevPos >= fText->length()) {
1917 return -1;
73c04bcf 1918 }
46f4442e
A
1919 p0 = p1 = p2 = p3 = prevPos;
1920 c3 = fText->char32At(prevPos);
1921 c0 = c1 = c2 = 0;
57a6839d
A
1922 (void)p0; // suppress set but not used warning.
1923 (void)c0;
46f4442e
A
1924
1925 // Loop runs once per "significant" character position in the input text.
1926 for (;;) {
1927 // Move all of the positions forward in the input string.
1928 p0 = p1; c0 = c1;
1929 p1 = p2; c1 = c2;
1930 p2 = p3; c2 = c3;
1931
1932 // Advancd p3 by one codepoint
1933 p3 = fText->moveIndex32(p3, 1);
1934 c3 = fText->char32At(p3);
1935
1936 if (p1 == p2) {
1937 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1938 continue;
1939 }
1940 if (p2 == fText->length()) {
1941 // Reached end of string. Always a break position.
1942 break;
1943 }
1944
1945 // Rule GB3 CR x LF
1946 // No Extend or Format characters may appear between the CR and LF,
1947 // which requires the additional check for p2 immediately following p1.
1948 //
1949 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1950 continue;
1951 }
1952
1953 // Rule (GB4). ( Control | CR | LF ) <break>
1954 if (fControlSet->contains(c1) ||
1955 c1 == 0x0D ||
1956 c1 == 0x0A) {
1957 break;
1958 }
1959
1960 // Rule (GB5) <break> ( Control | CR | LF )
1961 //
1962 if (fControlSet->contains(c2) ||
1963 c2 == 0x0D ||
1964 c2 == 0x0A) {
1965 break;
1966 }
1967
1968
1969 // Rule (GB6) L x ( L | V | LV | LVT )
1970 if (fLSet->contains(c1) &&
1971 (fLSet->contains(c2) ||
1972 fVSet->contains(c2) ||
1973 fLVSet->contains(c2) ||
1974 fLVTSet->contains(c2))) {
1975 continue;
1976 }
1977
1978 // Rule (GB7) ( LV | V ) x ( V | T )
1979 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1980 (fVSet->contains(c2) || fTSet->contains(c2))) {
1981 continue;
1982 }
1983
1984 // Rule (GB8) ( LVT | T) x T
1985 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1986 fTSet->contains(c2)) {
1987 continue;
1988 }
1989
51004dcb
A
1990 // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
1991
1992 // Rule (GB8a) Regional_Indicator x Regional_Indicator
1993 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1994 continue;
1995 }
1996
46f4442e
A
1997 // Rule (GB9) Numeric x ALetter
1998 if (fExtendSet->contains(c2)) {
1999 continue;
2000 }
2001
2002 // Rule (GB9a) x SpacingMark
2003 if (fSpacingSet->contains(c2)) {
2004 continue;
2005 }
2006
2007 // Rule (GB9b) Prepend x
2008 if (fPrependSet->contains(c1)) {
2009 continue;
2010 }
2011
2012 // Rule (GB10) Any <break> Any
2013 break;
2014 }
2015
2016 breakPos = p2;
2017 return breakPos;
73c04bcf
A
2018}
2019
2020
46f4442e 2021
73c04bcf
A
2022UVector *RBBICharMonkey::charClasses() {
2023 return fSets;
2024}
2025
2026
2027RBBICharMonkey::~RBBICharMonkey() {
2028 delete fSets;
2029 delete fCRLFSet;
2030 delete fControlSet;
2031 delete fExtendSet;
51004dcb 2032 delete fRegionalIndicatorSet;
46f4442e
A
2033 delete fPrependSet;
2034 delete fSpacingSet;
2035 delete fLSet;
2036 delete fVSet;
2037 delete fTSet;
2038 delete fLVSet;
2039 delete fLVTSet;
73c04bcf
A
2040 delete fHangulSet;
2041 delete fAnySet;
73c04bcf
A
2042}
2043
2044//------------------------------------------------------------------------------------------
2045//
2046// class RBBIWordMonkey Word Break specific implementation
2047// of RBBIMonkeyKind.
2048//
2049//------------------------------------------------------------------------------------------
2050class RBBIWordMonkey: public RBBIMonkeyKind {
2051public:
2052 RBBIWordMonkey();
2053 virtual ~RBBIWordMonkey();
2054 virtual UVector *charClasses();
2055 virtual void setText(const UnicodeString &s);
2056 virtual int32_t next(int32_t i);
2057private:
2058 UVector *fSets;
2059
46f4442e
A
2060 UnicodeSet *fCRSet;
2061 UnicodeSet *fLFSet;
2062 UnicodeSet *fNewlineSet;
57a6839d 2063 UnicodeSet *fRegionalIndicatorSet;
73c04bcf 2064 UnicodeSet *fKatakanaSet;
57a6839d 2065 UnicodeSet *fHebrew_LetterSet;
73c04bcf 2066 UnicodeSet *fALetterSet;
51004dcb
A
2067 // TODO(jungshik): Do we still need this change?
2068 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
57a6839d
A
2069 UnicodeSet *fSingle_QuoteSet;
2070 UnicodeSet *fDouble_QuoteSet;
46f4442e 2071 UnicodeSet *fMidNumLetSet;
73c04bcf
A
2072 UnicodeSet *fMidLetterSet;
2073 UnicodeSet *fMidNumSet;
2074 UnicodeSet *fNumericSet;
2075 UnicodeSet *fFormatSet;
2076 UnicodeSet *fOtherSet;
2077 UnicodeSet *fExtendSet;
2078 UnicodeSet *fExtendNumLetSet;
51004dcb 2079 UnicodeSet *fDictionaryCjkSet;
73c04bcf 2080
73c04bcf
A
2081 const UnicodeString *fText;
2082};
2083
2084
46f4442e 2085RBBIWordMonkey::RBBIWordMonkey()
73c04bcf
A
2086{
2087 UErrorCode status = U_ZERO_ERROR;
2088
73c04bcf
A
2089 fSets = new UVector(status);
2090
46f4442e
A
2091 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2092 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2093 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
51004dcb
A
2094 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2095 // Exclude Hangul syllables from ALetterSet during testing.
2096 // Leave CJK dictionary characters out from the monkey tests!
2097#if 0
2098 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2099 "[\\p{Line_Break = Complex_Context}"
2100 "-\\p{Grapheme_Cluster_Break = Extend}"
2101 "-\\p{Grapheme_Cluster_Break = Control}"
2102 "]]",
2103 status);
2104#endif
57a6839d
A
2105 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2106 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2107 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2108 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
51004dcb 2109 fALetterSet->removeAll(*fDictionaryCjkSet);
57a6839d
A
2110 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2111 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2112 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2113 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2114 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
51004dcb
A
2115 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2116 // we should figure out why
57a6839d
A
2117 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2118 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2119 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2120 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
46f4442e 2121
73c04bcf
A
2122 fOtherSet = new UnicodeSet();
2123 if(U_FAILURE(status)) {
2124 deferredStatus = status;
2125 return;
2126 }
2127
2128 fOtherSet->complement();
46f4442e
A
2129 fOtherSet->removeAll(*fCRSet);
2130 fOtherSet->removeAll(*fLFSet);
2131 fOtherSet->removeAll(*fNewlineSet);
73c04bcf 2132 fOtherSet->removeAll(*fKatakanaSet);
57a6839d 2133 fOtherSet->removeAll(*fHebrew_LetterSet);
73c04bcf 2134 fOtherSet->removeAll(*fALetterSet);
57a6839d
A
2135 fOtherSet->removeAll(*fSingle_QuoteSet);
2136 fOtherSet->removeAll(*fDouble_QuoteSet);
73c04bcf
A
2137 fOtherSet->removeAll(*fMidLetterSet);
2138 fOtherSet->removeAll(*fMidNumSet);
2139 fOtherSet->removeAll(*fNumericSet);
2140 fOtherSet->removeAll(*fExtendNumLetSet);
2141 fOtherSet->removeAll(*fFormatSet);
2142 fOtherSet->removeAll(*fExtendSet);
51004dcb 2143 fOtherSet->removeAll(*fRegionalIndicatorSet);
46f4442e 2144 // Inhibit dictionary characters from being tested at all.
51004dcb 2145 fOtherSet->removeAll(*fDictionaryCjkSet);
46f4442e 2146 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
73c04bcf 2147
57a6839d
A
2148 fSets->addElement(fCRSet, status);
2149 fSets->addElement(fLFSet, status);
2150 fSets->addElement(fNewlineSet, status);
51004dcb 2151 fSets->addElement(fRegionalIndicatorSet, status);
57a6839d
A
2152 fSets->addElement(fHebrew_LetterSet, status);
2153 fSets->addElement(fALetterSet, status);
2154 fSets->addElement(fSingle_QuoteSet, status);
2155 fSets->addElement(fDouble_QuoteSet, status);
2156 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2157 fSets->addElement(fMidLetterSet, status);
2158 fSets->addElement(fMidNumLetSet, status);
2159 fSets->addElement(fMidNumSet, status);
2160 fSets->addElement(fNumericSet, status);
2161 fSets->addElement(fFormatSet, status);
2162 fSets->addElement(fExtendSet, status);
2163 fSets->addElement(fOtherSet, status);
2164 fSets->addElement(fExtendNumLetSet, status);
73c04bcf 2165
73c04bcf
A
2166 if (U_FAILURE(status)) {
2167 deferredStatus = status;
2168 }
2169}
2170
2171void RBBIWordMonkey::setText(const UnicodeString &s) {
2172 fText = &s;
2173}
2174
2175
2176int32_t RBBIWordMonkey::next(int32_t prevPos) {
2177 int p0, p1, p2, p3; // Indices of the significant code points around the
2178 // break position being tested. The candidate break
2179 // location is before p2.
2180
2181 int breakPos = -1;
2182
2183 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
46f4442e
A
2184
2185 if (U_FAILURE(deferredStatus)) {
2186 return -1;
2187 }
73c04bcf
A
2188
2189 // Prev break at end of string. return DONE.
2190 if (prevPos >= fText->length()) {
2191 return -1;
2192 }
2193 p0 = p1 = p2 = p3 = prevPos;
2194 c3 = fText->char32At(prevPos);
2195 c0 = c1 = c2 = 0;
57a6839d 2196 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2197
2198 // Loop runs once per "significant" character position in the input text.
2199 for (;;) {
2200 // Move all of the positions forward in the input string.
2201 p0 = p1; c0 = c1;
2202 p1 = p2; c1 = c2;
2203 p2 = p3; c2 = c3;
2204
2205 // Advancd p3 by X(Extend | Format)* Rule 4
46f4442e 2206 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
73c04bcf
A
2207 do {
2208 p3 = fText->moveIndex32(p3, 1);
2209 c3 = fText->char32At(p3);
46f4442e
A
2210 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2211 break;
2212 };
73c04bcf
A
2213 }
2214 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2215
2216
2217 if (p1 == p2) {
2218 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2219 continue;
2220 }
2221 if (p2 == fText->length()) {
2222 // Reached end of string. Always a break position.
2223 break;
2224 }
46f4442e 2225
73c04bcf
A
2226 // Rule (3) CR x LF
2227 // No Extend or Format characters may appear between the CR and LF,
2228 // which requires the additional check for p2 immediately following p1.
2229 //
46f4442e 2230 if (c1==0x0D && c2==0x0A) {
73c04bcf
A
2231 continue;
2232 }
46f4442e
A
2233
2234 // Rule (3a) Break before and after newlines (including CR and LF)
2235 //
2236 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2237 break;
2238 };
2239 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2240 break;
2241 };
73c04bcf 2242
57a6839d
A
2243 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2244 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2245 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2246 continue;
2247 }
2248
57a6839d 2249 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
73c04bcf 2250 //
57a6839d
A
2251 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2252 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2253 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2254 continue;
2255 }
2256
2257 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2258 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2259 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2260 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2261 continue;
2262 }
2263
57a6839d
A
2264 // Rule (7a) Hebrew_Letter x Single_Quote
2265 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2266 continue;
2267 }
73c04bcf 2268
57a6839d
A
2269 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2270 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2271 continue;
2272 }
2273
2274 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2275 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
73c04bcf
A
2276 continue;
2277 }
2278
2279 // Rule (8) Numeric x Numeric
2280 if (fNumericSet->contains(c1) &&
2281 fNumericSet->contains(c2)) {
2282 continue;
2283 }
2284
57a6839d
A
2285 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2286 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
73c04bcf
A
2287 fNumericSet->contains(c2)) {
2288 continue;
2289 }
2290
57a6839d 2291 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
73c04bcf 2292 if (fNumericSet->contains(c1) &&
57a6839d 2293 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
73c04bcf
A
2294 continue;
2295 }
2296
57a6839d 2297 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
46f4442e 2298 if (fNumericSet->contains(c0) &&
57a6839d 2299 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
73c04bcf
A
2300 fNumericSet->contains(c2)) {
2301 continue;
2302 }
2303
57a6839d 2304 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
73c04bcf 2305 if (fNumericSet->contains(c1) &&
57a6839d 2306 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
73c04bcf
A
2307 fNumericSet->contains(c3)) {
2308 continue;
2309 }
2310
2311 // Rule (13) Katakana x Katakana
2312 if (fKatakanaSet->contains(c1) &&
2313 fKatakanaSet->contains(c2)) {
2314 continue;
2315 }
2316
57a6839d
A
2317 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2318 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
73c04bcf
A
2319 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2320 fExtendNumLetSet->contains(c2)) {
2321 continue;
51004dcb 2322 }
73c04bcf 2323
57a6839d 2324 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
73c04bcf 2325 if (fExtendNumLetSet->contains(c1) &&
57a6839d
A
2326 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2327 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2328 continue;
51004dcb
A
2329 }
2330
2331 // Rule 13c
2332 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2333 continue;
2334 }
73c04bcf
A
2335
2336 // Rule 14. Break found here.
2337 break;
2338 }
2339
2340 breakPos = p2;
2341 return breakPos;
2342}
2343
2344
2345UVector *RBBIWordMonkey::charClasses() {
2346 return fSets;
2347}
2348
2349
2350RBBIWordMonkey::~RBBIWordMonkey() {
2351 delete fSets;
46f4442e
A
2352 delete fCRSet;
2353 delete fLFSet;
2354 delete fNewlineSet;
73c04bcf 2355 delete fKatakanaSet;
57a6839d 2356 delete fHebrew_LetterSet;
73c04bcf 2357 delete fALetterSet;
57a6839d
A
2358 delete fSingle_QuoteSet;
2359 delete fDouble_QuoteSet;
46f4442e 2360 delete fMidNumLetSet;
73c04bcf
A
2361 delete fMidLetterSet;
2362 delete fMidNumSet;
2363 delete fNumericSet;
2364 delete fFormatSet;
2365 delete fExtendSet;
2366 delete fExtendNumLetSet;
51004dcb
A
2367 delete fRegionalIndicatorSet;
2368 delete fDictionaryCjkSet;
73c04bcf
A
2369 delete fOtherSet;
2370}
2371
2372
2373
2374
2375//------------------------------------------------------------------------------------------
2376//
2377// class RBBISentMonkey Sentence Break specific implementation
2378// of RBBIMonkeyKind.
2379//
2380//------------------------------------------------------------------------------------------
2381class RBBISentMonkey: public RBBIMonkeyKind {
2382public:
2383 RBBISentMonkey();
2384 virtual ~RBBISentMonkey();
2385 virtual UVector *charClasses();
2386 virtual void setText(const UnicodeString &s);
2387 virtual int32_t next(int32_t i);
2388private:
2389 int moveBack(int posFrom);
2390 int moveForward(int posFrom);
2391 UChar32 cAt(int pos);
2392
2393 UVector *fSets;
2394
2395 UnicodeSet *fSepSet;
2396 UnicodeSet *fFormatSet;
2397 UnicodeSet *fSpSet;
2398 UnicodeSet *fLowerSet;
2399 UnicodeSet *fUpperSet;
2400 UnicodeSet *fOLetterSet;
2401 UnicodeSet *fNumericSet;
2402 UnicodeSet *fATermSet;
46f4442e 2403 UnicodeSet *fSContinueSet;
73c04bcf
A
2404 UnicodeSet *fSTermSet;
2405 UnicodeSet *fCloseSet;
2406 UnicodeSet *fOtherSet;
2407 UnicodeSet *fExtendSet;
2408
2409 const UnicodeString *fText;
2410
2411};
2412
2413RBBISentMonkey::RBBISentMonkey()
2414{
2415 UErrorCode status = U_ZERO_ERROR;
2416
2417 fSets = new UVector(status);
2418
46f4442e
A
2419 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2420 // set and made into character classes of their own. For the monkey impl,
2421 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2422 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2423 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2424 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2425 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2426 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2427 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2428 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2429 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2430 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2431 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2432 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2433 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
73c04bcf
A
2434 fOtherSet = new UnicodeSet();
2435
2436 if(U_FAILURE(status)) {
2437 deferredStatus = status;
2438 return;
2439 }
2440
2441 fOtherSet->complement();
2442 fOtherSet->removeAll(*fSepSet);
2443 fOtherSet->removeAll(*fFormatSet);
2444 fOtherSet->removeAll(*fSpSet);
2445 fOtherSet->removeAll(*fLowerSet);
2446 fOtherSet->removeAll(*fUpperSet);
2447 fOtherSet->removeAll(*fOLetterSet);
2448 fOtherSet->removeAll(*fNumericSet);
2449 fOtherSet->removeAll(*fATermSet);
46f4442e 2450 fOtherSet->removeAll(*fSContinueSet);
73c04bcf
A
2451 fOtherSet->removeAll(*fSTermSet);
2452 fOtherSet->removeAll(*fCloseSet);
2453 fOtherSet->removeAll(*fExtendSet);
2454
46f4442e
A
2455 fSets->addElement(fSepSet, status);
2456 fSets->addElement(fFormatSet, status);
2457 fSets->addElement(fSpSet, status);
2458 fSets->addElement(fLowerSet, status);
2459 fSets->addElement(fUpperSet, status);
2460 fSets->addElement(fOLetterSet, status);
2461 fSets->addElement(fNumericSet, status);
2462 fSets->addElement(fATermSet, status);
2463 fSets->addElement(fSContinueSet, status);
2464 fSets->addElement(fSTermSet, status);
2465 fSets->addElement(fCloseSet, status);
2466 fSets->addElement(fOtherSet, status);
2467 fSets->addElement(fExtendSet, status);
73c04bcf
A
2468
2469 if (U_FAILURE(status)) {
2470 deferredStatus = status;
2471 }
2472}
2473
2474
2475
2476void RBBISentMonkey::setText(const UnicodeString &s) {
2477 fText = &s;
2478}
2479
2480UVector *RBBISentMonkey::charClasses() {
2481 return fSets;
2482}
2483
2484
2485// moveBack() Find the "significant" code point preceding the index i.
2486// Skips over ($Extend | $Format)* .
46f4442e 2487//
73c04bcf
A
2488int RBBISentMonkey::moveBack(int i) {
2489 if (i <= 0) {
2490 return -1;
2491 }
2492 UChar32 c;
2493 int32_t j = i;
2494 do {
2495 j = fText->moveIndex32(j, -1);
2496 c = fText->char32At(j);
2497 }
2498 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2499 return j;
2500
2501 }
2502
2503
2504int RBBISentMonkey::moveForward(int i) {
2505 if (i>=fText->length()) {
2506 return fText->length();
2507 }
2508 UChar32 c;
2509 int32_t j = i;
2510 do {
2511 j = fText->moveIndex32(j, 1);
2512 c = cAt(j);
2513 }
2514 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2515 return j;
2516}
2517
2518UChar32 RBBISentMonkey::cAt(int pos) {
2519 if (pos<0 || pos>=fText->length()) {
2520 return -1;
2521 } else {
2522 return fText->char32At(pos);
2523 }
2524}
2525
2526int32_t RBBISentMonkey::next(int32_t prevPos) {
2527 int p0, p1, p2, p3; // Indices of the significant code points around the
2528 // break position being tested. The candidate break
2529 // location is before p2.
2530
2531 int breakPos = -1;
2532
2533 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2534 UChar32 c;
2535
46f4442e
A
2536 if (U_FAILURE(deferredStatus)) {
2537 return -1;
2538 }
2539
73c04bcf
A
2540 // Prev break at end of string. return DONE.
2541 if (prevPos >= fText->length()) {
2542 return -1;
2543 }
2544 p0 = p1 = p2 = p3 = prevPos;
2545 c3 = fText->char32At(prevPos);
2546 c0 = c1 = c2 = 0;
57a6839d 2547 (void)p0; // Suppress set but not used warning.
73c04bcf
A
2548
2549 // Loop runs once per "significant" character position in the input text.
2550 for (;;) {
2551 // Move all of the positions forward in the input string.
2552 p0 = p1; c0 = c1;
2553 p1 = p2; c1 = c2;
2554 p2 = p3; c2 = c3;
46f4442e 2555
73c04bcf
A
2556 // Advancd p3 by X(Extend | Format)* Rule 4
2557 p3 = moveForward(p3);
2558 c3 = cAt(p3);
2559
2560 // Rule (3) CR x LF
2561 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2562 continue;
2563 }
46f4442e 2564
73c04bcf
A
2565 // Rule (4). Sep <break>
2566 if (fSepSet->contains(c1)) {
2567 p2 = p1+1; // Separators don't combine with Extend or Format.
2568 break;
2569 }
2570
2571 if (p2 >= fText->length()) {
2572 // Reached end of string. Always a break position.
2573 break;
2574 }
2575
2576 if (p2 == prevPos) {
2577 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2578 continue;
2579 }
46f4442e 2580
73c04bcf
A
2581 // Rule (6). ATerm x Numeric
2582 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2583 continue;
2584 }
2585
2586 // Rule (7). Upper ATerm x Uppper
2587 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2588 continue;
2589 }
2590
2591 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2592 // Note: STerm | ATerm are added to the negated part of the expression by a
2593 // note to the Unicode 5.0 documents.
2594 int p8 = p1;
2595 while (fSpSet->contains(cAt(p8))) {
2596 p8 = moveBack(p8);
2597 }
2598 while (fCloseSet->contains(cAt(p8))) {
2599 p8 = moveBack(p8);
2600 }
2601 if (fATermSet->contains(cAt(p8))) {
2602 p8=p2;
2603 for (;;) {
2604 c = cAt(p8);
2605 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2606 fLowerSet->contains(c) || fSepSet->contains(c) ||
2607 fATermSet->contains(c) || fSTermSet->contains(c)) {
2608 break;
2609 }
2610 p8 = moveForward(p8);
2611 }
2612 if (fLowerSet->contains(cAt(p8))) {
2613 continue;
2614 }
2615 }
46f4442e
A
2616
2617 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2618 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
73c04bcf
A
2619 p8 = p1;
2620 while (fSpSet->contains(cAt(p8))) {
2621 p8 = moveBack(p8);
2622 }
2623 while (fCloseSet->contains(cAt(p8))) {
2624 p8 = moveBack(p8);
2625 }
2626 c = cAt(p8);
2627 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2628 continue;
2629 }
2630 }
2631
46f4442e 2632 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
73c04bcf
A
2633 int p9 = p1;
2634 while (fCloseSet->contains(cAt(p9))) {
2635 p9 = moveBack(p9);
2636 }
2637 c = cAt(p9);
2638 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2639 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2640 continue;
2641 }
2642 }
2643
46f4442e 2644 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
73c04bcf
A
2645 int p10 = p1;
2646 while (fSpSet->contains(cAt(p10))) {
2647 p10 = moveBack(p10);
2648 }
2649 while (fCloseSet->contains(cAt(p10))) {
2650 p10 = moveBack(p10);
2651 }
2652 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2653 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2654 continue;
2655 }
2656 }
2657
46f4442e 2658 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
73c04bcf 2659 int p11 = p1;
46f4442e
A
2660 if (fSepSet->contains(cAt(p11))) {
2661 p11 = moveBack(p11);
2662 }
73c04bcf
A
2663 while (fSpSet->contains(cAt(p11))) {
2664 p11 = moveBack(p11);
2665 }
2666 while (fCloseSet->contains(cAt(p11))) {
2667 p11 = moveBack(p11);
2668 }
2669 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2670 break;
2671 }
2672
2673 // Rule (12) Any x Any
2674 continue;
2675 }
2676 breakPos = p2;
2677 return breakPos;
2678}
2679
2680RBBISentMonkey::~RBBISentMonkey() {
2681 delete fSets;
2682 delete fSepSet;
2683 delete fFormatSet;
2684 delete fSpSet;
2685 delete fLowerSet;
2686 delete fUpperSet;
2687 delete fOLetterSet;
2688 delete fNumericSet;
2689 delete fATermSet;
46f4442e 2690 delete fSContinueSet;
73c04bcf
A
2691 delete fSTermSet;
2692 delete fCloseSet;
2693 delete fOtherSet;
2694 delete fExtendSet;
2695}
2696
2697
2698
2699//-------------------------------------------------------------------------------------------
2700//
2701// RBBILineMonkey
2702//
2703//-------------------------------------------------------------------------------------------
2704
2705class RBBILineMonkey: public RBBIMonkeyKind {
2706public:
2707 RBBILineMonkey();
2708 virtual ~RBBILineMonkey();
2709 virtual UVector *charClasses();
2710 virtual void setText(const UnicodeString &s);
2711 virtual int32_t next(int32_t i);
2712 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2713private:
2714 UVector *fSets;
2715
2716 UnicodeSet *fBK;
2717 UnicodeSet *fCR;
2718 UnicodeSet *fLF;
2719 UnicodeSet *fCM;
2720 UnicodeSet *fNL;
2721 UnicodeSet *fSG;
2722 UnicodeSet *fWJ;
2723 UnicodeSet *fZW;
2724 UnicodeSet *fGL;
2725 UnicodeSet *fCB;
2726 UnicodeSet *fSP;
2727 UnicodeSet *fB2;
2728 UnicodeSet *fBA;
2729 UnicodeSet *fBB;
2730 UnicodeSet *fHY;
2731 UnicodeSet *fH2;
2732 UnicodeSet *fH3;
2733 UnicodeSet *fCL;
729e4ab9 2734 UnicodeSet *fCP;
73c04bcf
A
2735 UnicodeSet *fEX;
2736 UnicodeSet *fIN;
2737 UnicodeSet *fJL;
2738 UnicodeSet *fJV;
2739 UnicodeSet *fJT;
2740 UnicodeSet *fNS;
2741 UnicodeSet *fOP;
2742 UnicodeSet *fQU;
2743 UnicodeSet *fIS;
2744 UnicodeSet *fNU;
2745 UnicodeSet *fPO;
2746 UnicodeSet *fPR;
2747 UnicodeSet *fSY;
2748 UnicodeSet *fAI;
2749 UnicodeSet *fAL;
4388f060
A
2750 UnicodeSet *fCJ;
2751 UnicodeSet *fHL;
73c04bcf 2752 UnicodeSet *fID;
51004dcb 2753 UnicodeSet *fRI;
73c04bcf
A
2754 UnicodeSet *fSA;
2755 UnicodeSet *fXX;
2756
57a6839d 2757 BreakIterator *fCharBI;
73c04bcf 2758 const UnicodeString *fText;
73c04bcf 2759 RegexMatcher *fNumberMatcher;
73c04bcf
A
2760};
2761
2762
2763RBBILineMonkey::RBBILineMonkey()
2764{
2765 UErrorCode status = U_ZERO_ERROR;
2766
2767 fSets = new UVector(status);
2768
46f4442e
A
2769 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2770 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2771 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2772 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2773 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2774 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2775 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2776 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2777 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2778 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2779 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2780 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2781 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2782 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2783 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2784 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2785 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
729e4ab9 2786 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
46f4442e
A
2787 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2788 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2789 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2790 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2791 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2792 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2793 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2794 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2795 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2796 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2797 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2798 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2799 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2800 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2801 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
4388f060
A
2802 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2803 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
46f4442e 2804 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
51004dcb 2805 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
46f4442e
A
2806 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2807 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2808 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
73c04bcf
A
2809
2810 if (U_FAILURE(status)) {
2811 deferredStatus = status;
2812 fCharBI = NULL;
2813 fNumberMatcher = NULL;
2814 return;
2815 }
2816
2817 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2818 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2819 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
2820 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2821
4388f060
A
2822 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2823
73c04bcf
A
2824 fSets->addElement(fBK, status);
2825 fSets->addElement(fCR, status);
2826 fSets->addElement(fLF, status);
2827 fSets->addElement(fCM, status);
2828 fSets->addElement(fNL, status);
2829 fSets->addElement(fWJ, status);
2830 fSets->addElement(fZW, status);
2831 fSets->addElement(fGL, status);
2832 fSets->addElement(fCB, status);
2833 fSets->addElement(fSP, status);
2834 fSets->addElement(fB2, status);
2835 fSets->addElement(fBA, status);
2836 fSets->addElement(fBB, status);
2837 fSets->addElement(fHY, status);
2838 fSets->addElement(fH2, status);
2839 fSets->addElement(fH3, status);
2840 fSets->addElement(fCL, status);
729e4ab9 2841 fSets->addElement(fCP, status);
73c04bcf
A
2842 fSets->addElement(fEX, status);
2843 fSets->addElement(fIN, status);
2844 fSets->addElement(fJL, status);
2845 fSets->addElement(fJT, status);
2846 fSets->addElement(fJV, status);
2847 fSets->addElement(fNS, status);
2848 fSets->addElement(fOP, status);
2849 fSets->addElement(fQU, status);
2850 fSets->addElement(fIS, status);
2851 fSets->addElement(fNU, status);
2852 fSets->addElement(fPO, status);
2853 fSets->addElement(fPR, status);
2854 fSets->addElement(fSY, status);
2855 fSets->addElement(fAI, status);
2856 fSets->addElement(fAL, status);
4388f060 2857 fSets->addElement(fHL, status);
73c04bcf
A
2858 fSets->addElement(fID, status);
2859 fSets->addElement(fWJ, status);
51004dcb 2860 fSets->addElement(fRI, status);
73c04bcf
A
2861 fSets->addElement(fSA, status);
2862 fSets->addElement(fSG, status);
2863
46f4442e
A
2864 const char *rules =
2865 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2866 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2867 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2868 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
729e4ab9 2869 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
46f4442e
A
2870 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2871
73c04bcf 2872 fNumberMatcher = new RegexMatcher(
46f4442e 2873 UnicodeString(rules, -1, US_INV), 0, status);
73c04bcf
A
2874
2875 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2876
2877 if (U_FAILURE(status)) {
2878 deferredStatus = status;
2879 }
2880}
2881
2882
2883void RBBILineMonkey::setText(const UnicodeString &s) {
2884 fText = &s;
2885 fCharBI->setText(s);
2886 fNumberMatcher->reset(s);
2887}
2888
2889//
2890// rule9Adjust
2891// Line Break TR rules 9 and 10 implementation.
2892// This deals with combining marks and other sequences that
2893// that must be treated as if they were something other than what they actually are.
2894//
2895// This is factored out into a separate function because it must be applied twice for
2896// each potential break, once to the chars before the position being checked, then
2897// again to the text following the possible break.
2898//
2899void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2900 if (pos == -1) {
2901 // Invalid initial position. Happens during the warmup iteration of the
2902 // main loop in next().
2903 return;
2904 }
2905
2906 int32_t nPos = *nextPos;
2907
2908 // LB 9 Keep combining sequences together.
2909 // advance over any CM class chars. Note that Line Break CM is different
2910 // from the normal Grapheme Extend property.
2911 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2912 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2913 for (;;) {
2914 *nextChar = fText->char32At(nPos);
2915 if (!fCM->contains(*nextChar)) {
2916 break;
2917 }
2918 nPos = fText->moveIndex32(nPos, 1);
2919 }
2920 }
2921
2922
2923 // LB 9 Treat X CM* as if it were x.
2924 // No explicit action required.
2925
2926 // LB 10 Treat any remaining combining mark as AL
2927 if (fCM->contains(*posChar)) {
2928 *posChar = 0x41; // thisChar = 'A';
2929 }
2930
2931 // Push the updated nextPos and nextChar back to our caller.
2932 // This only makes a difference if posChar got bigger by consuming a
2933 // combining sequence.
2934 *nextPos = nPos;
2935 *nextChar = fText->char32At(nPos);
2936}
2937
2938
2939
//
//  next
//      Reference implementation of line breaking used by the monkey test.
//      Starting at startPos, walks forward through fText one (combining-sequence
//      adjusted) character at a time and applies the Line Break TR rules in order,
//      LB 2 through LB 31.  A rule that forbids a break does "continue" to move on
//      to the following position; a rule that allows a break does "break" out of
//      the loop.
//
//      Returns the value of pos when the loop exits, i.e. the index of the character
//      following the break that was found.  Returns -1 without scanning when
//      deferredStatus holds a failure or when startPos is at or beyond the end of the text.
//
2940int32_t RBBILineMonkey::next(int32_t startPos) {
2941 UErrorCode status = U_ZERO_ERROR;
2942 int32_t pos; // Index of the char following a potential break position
2943 UChar32 thisChar; // Character at above position "pos"
2944
2945 int32_t prevPos; // Index of the char preceding a potential break position
2946 UChar32 prevChar; // Character at above position. Note that prevChar
2947 // and thisChar may not be adjacent because combining
2948 // characters between them will be ignored.
2949
4388f060
A
2950 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2951 UChar32 prevCharX2;
2952
73c04bcf
A
2953 int32_t nextPos; // Index of the next character following pos.
2954 // Usually skips over combining marks.
2955 int32_t nextCPPos; // Index of the code point following "pos."
2956 // May point to a combining mark.
2957 int32_t tPos; // temp value.
2958 UChar32 c;
2959
46f4442e
A
    // A failure recorded by the constructor means the sets / matcher are unusable.
2960 if (U_FAILURE(deferredStatus)) {
2961 return -1;
2962 }
2963
73c04bcf
A
2964 if (startPos >= fText->length()) {
2965 return -1;
2966 }
2967
2968
2969 // Initial values for loop. Loop will run the first time without finding breaks,
2970 // while the invalid values shift out and the "this" and
2971 // "prev" positions are filled in with good values.
4388f060
A
2972 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2973 thisChar = prevChar = prevCharX2 = 0;
73c04bcf
A
2974 nextPos = nextCPPos = startPos;
2975
2976
2977 // Loop runs once per position in the test text, until a break position
2978 // is found.
2979 for (;;) {
    // Shift the context window by one character: prev -> prevX2, this -> prev, next -> this.
4388f060
A
2980 prevPosX2 = prevPos;
2981 prevCharX2 = prevChar;
2982
73c04bcf
A
2983 prevPos = pos;
2984 prevChar = thisChar;
2985
2986 pos = nextPos;
2987 thisChar = fText->char32At(pos);
2988
2989 nextCPPos = fText->moveIndex32(pos, 1);
2990 nextPos = nextCPPos;
2991
2992 // Rule LB2 - Break at end of text.
2993 if (pos >= fText->length()) {
2994 break;
2995 }
2996
2997 // Rule LB 9 - adjust for combining sequences.
2998 // We do this one out-of-order because the adjustment does not change anything
2999 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3000 // be applied.
    // The first call may advance pos/thisChar past combining marks attached to prevChar,
    // so nextPos is recomputed from the adjusted pos before the second call.
3001 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3002 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3003 c = fText->char32At(nextPos);
3004 rule9Adjust(pos, &thisChar, &nextPos, &c);
3005
3006 // If the loop is still warming up - if we haven't shifted the initial
3007 // -1 positions out of prevPos yet - loop back to advance the
3008 // position in the input without any further looking for breaks.
3009 if (prevPos == -1) {
3010 continue;
3011 }
46f4442e 3012
73c04bcf
A
3013 // LB 4 Always break after hard line breaks,
3014 if (fBK->contains(prevChar)) {
3015 break;
3016 }
3017
3018 // LB 5 Break after CR, LF, NL, but not inside CR LF
    // Note: tested against the literal code points 0x0d, 0x0a and 0x85.
3019 if (prevChar == 0x0d && thisChar == 0x0a) {
3020 continue;
3021 }
3022 if (prevChar == 0x0d ||
3023 prevChar == 0x0a ||
3024 prevChar == 0x85) {
3025 break;
3026 }
3027
3028 // LB 6 Don't break before hard line breaks
3029 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3030 fBK->contains(thisChar)) {
3031 continue;
3032 }
3033
3034
3035 // LB 7 Don't break before spaces or zero-width space.
3036 if (fSP->contains(thisChar)) {
3037 continue;
3038 }
3039
3040 if (fZW->contains(thisChar)) {
3041 continue;
3042 }
3043
3044 // LB 8 Break after zero width space
3045 if (fZW->contains(prevChar)) {
3046 break;
3047 }
3048
3049 // LB 9, 10 Already done, at top of loop.
3050 //
3051
3052
3053 // LB 11 Do not break before or after WORD JOINER and related characters.
3054 // x WJ
3055 // WJ x
3056 //
3057 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3058 continue;
3059 }
3060
3061 // LB 12
73c04bcf 3062 // GL x
46f4442e 3063 if (fGL->contains(prevChar)) {
73c04bcf
A
3064 continue;
3065 }
3066
46f4442e
A
3067 // LB 12a
3068 // [^SP BA HY] x GL
3069 if (!(fSP->contains(prevChar) ||
3070 fBA->contains(prevChar) ||
3071 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3072 continue;
3073 }
3074
3075
73c04bcf
A
3076
3077 // LB 13 Don't break before closings.
729e4ab9 3078 // NU x CL, NU x CP and NU x IS are not matched here so that they will
73c04bcf
A
3079 // fall into LB 17 and the more general number regular expression.
    // NOTE(review): in this function the number regular expression is applied in the
    // "LB 25 Numbers" step further down; the "LB 17" reference above looks like
    // older rule numbering - confirm.  NU x SY is excluded here in the same way.
3080 //
729e4ab9
A
3081 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3082 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3083 fEX->contains(thisChar) ||
3084 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3085 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
73c04bcf
A
3086 continue;
3087 }
3088
3089 // LB 14 Don't break after OP SP*
3090 // Scan backwards, checking for this sequence.
3091 // The OP char could include combining marks, so we actually check for
3092 // OP CM* SP*
3093 // Another Twist: The Rule 67 fixes may have changed a SP CM
3094 // sequence into a ID char, so before scanning back through spaces,
3095 // verify that prevChar is indeed a space. The prevChar variable
3096 // may differ from fText[prevPos]
3097 tPos = prevPos;
3098 if (fSP->contains(prevChar)) {
3099 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3100 tPos=fText->moveIndex32(tPos, -1);
3101 }
3102 }
3103 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3104 tPos=fText->moveIndex32(tPos, -1);
3105 }
3106 if (fOP->contains(fText->char32At(tPos))) {
3107 continue;
3108 }
3109
3110
3111 // LB 15 QU SP* x OP
3112 if (fOP->contains(thisChar)) {
3113 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
    // Note: this local tPos shadows the function-scope tPos declared above.
3114 int tPos = prevPos;
3115 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3116 tPos = fText->moveIndex32(tPos, -1);
3117 }
3118 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3119 tPos = fText->moveIndex32(tPos, -1);
3120 }
3121 if (fQU->contains(fText->char32At(tPos))) {
3122 continue;
3123 }
3124 }
3125
3126
3127
729e4ab9
A
3128 // LB 16 (CL | CP) SP* x NS
3129 // Scan backwards for SP* CM* (CL | CP)
73c04bcf
A
3130 if (fNS->contains(thisChar)) {
    // Note: this local tPos shadows the function-scope tPos declared above.
3131 int tPos = prevPos;
3132 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3133 tPos = fText->moveIndex32(tPos, -1);
3134 }
3135 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3136 tPos = fText->moveIndex32(tPos, -1);
3137 }
729e4ab9 3138 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
73c04bcf
A
3139 continue;
3140 }
3141 }
3142
3143
3144 // LB 17 B2 SP* x B2
3145 if (fB2->contains(thisChar)) {
3146 // Scan backwards, checking for the B2 CM* SP* sequence.
3147 tPos = prevPos;
3148 if (fSP->contains(prevChar)) {
3149 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3150 tPos=fText->moveIndex32(tPos, -1);
3151 }
3152 }
3153 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3154 tPos=fText->moveIndex32(tPos, -1);
3155 }
3156 if (fB2->contains(fText->char32At(tPos))) {
3157 continue;
3158 }
3159 }
3160
46f4442e 3161
73c04bcf
A
3162 // LB 18 break after space
3163 if (fSP->contains(prevChar)) {
3164 break;
3165 }
3166
3167 // LB 19
3168 // x QU
3169 // QU x
3170 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3171 continue;
3172 }
3173
3174 // LB 20 Break around a CB
3175 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3176 break;
3177 }
3178
3179 // LB 21
    //         x BA,  x HY,  x NS,  BB x
3180 if (fBA->contains(thisChar) ||
3181 fHY->contains(thisChar) ||
3182 fNS->contains(thisChar) ||
3183 fBB->contains(prevChar) ) {
3184 continue;
3185 }
3186
4388f060
A
3187 // LB 21a
3188 // HL (HY | BA) x
    // Uses prevCharX2, the character two positions back, for the HL context.
3189 if (fHL->contains(prevCharX2) &&
3190 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3191 continue;
3192 }
3193
57a6839d
A
3194 // LB 21b
3195 // SY x HL
51004dcb
A
3196 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3197 continue;
3198 }
3199
73c04bcf 3200 // LB 22
    //         (AL | HL | ID | IN | NU) x IN
729e4ab9 3201 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
4388f060 3202 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
729e4ab9
A
3203 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3204 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3205 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
73c04bcf
A
3206 continue;
3207 }
3208
3209
3210 // LB 23 ID x PO
3211 // AL x NU
4388f060 3212 // HL x NU
73c04bcf 3213 // NU x AL
    //         NU x HL   (also tested below)
729e4ab9
A
3214 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3215 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
4388f060
A
3216 (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3217 (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3218 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
73c04bcf
A
3219 continue;
3220 }
3221
3222 // LB 24 Do not break between prefix and letters or ideographs.
3223 // PR x ID
4388f060
A
3224 // PR x (AL | HL)
3225 // PO x (AL | HL)
729e4ab9 3226 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
4388f060
A
3227 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3228 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) {
73c04bcf
A
3229 continue;
3230 }
46f4442e
A
3231
3232
3233
73c04bcf
A
3234 // LB 25 Numbers
    // Try to match the number regular expression (fNumberMatcher) anchored at prevPos.
    // A regex error terminates the scan at the current position.
3235 if (fNumberMatcher->lookingAt(prevPos, status)) {
3236 if (U_FAILURE(status)) {
3237 break;
3238 }
3239 // Matched a number. But could have been just a single digit, which would
3240 // not represent a "no break here" between prevChar and thisChar
3241 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3242 if (numEndIdx > pos) {
3243 // Number match includes at least our two chars being checked
3244 if (numEndIdx > nextPos) {
3245 // Number match includes additional chars. Update pos and nextPos
3246 // so that next loop iteration will continue at the end of the number,
3247 // checking for breaks between last char in number & whatever follows.
3248 pos = nextPos = numEndIdx;
    // Back pos up to the last character of the number that is not a CM.
3249 do {
3250 pos = fText->moveIndex32(pos, -1);
3251 thisChar = fText->char32At(pos);
3252 } while (fCM->contains(thisChar));
3253 }
3254 continue;
3255 }
3256 }
3257
3258
3259 // LB 26 Do not break a Korean syllable.
    //         JL x (JL | JV | H2 | H3)
    //         (JV | H2) x (JV | JT)
    //         (JT | H3) x JT
3260 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3261 fJV->contains(thisChar) ||
3262 fH2->contains(thisChar) ||
3263 fH3->contains(thisChar))) {
3264 continue;
3265 }
3266
3267 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3268 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3269 continue;
3270 }
3271
3272 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3273 fJT->contains(thisChar)) {
3274 continue;
3275 }
3276
3277 // LB 27 Treat a Korean Syllable Block the same as ID.
    //         (JL | JV | JT | H2 | H3) x IN
    //         (JL | JV | JT | H2 | H3) x PO
    //         PR x (JL | JV | JT | H2 | H3)
3278 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3279 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3280 fIN->contains(thisChar)) {
3281 continue;
3282 }
3283 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3284 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3285 fPO->contains(thisChar)) {
3286 continue;
3287 }
3288 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3289 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3290 continue;
3291 }
3292
3293
3294
46f4442e 3295 // LB 28 Do not break between alphabetics ("at").
4388f060 3296 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3297 continue;
3298 }
3299
3300 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
4388f060 3301 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3302 continue;
3303 }
3304
729e4ab9
A
3305 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3306 // (AL | NU) x OP
3307 // CP x (AL | NU)
    //         HL is accepted wherever AL is in both tests below.
4388f060 3308 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
729e4ab9
A
3309 continue;
3310 }
4388f060 3311 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
729e4ab9
A
3312 continue;
3313 }
3314
51004dcb
A
3315 // LB30a Do not break between regional indicators.
3316 // RI x RI
3317 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3318 continue;
3319 }
3320
73c04bcf
A
3321 // LB 31 Break everywhere else
3322 break;
3323
3324 }
3325
    // pos is the index of the character following the break that was found.
3326 return pos;
3327}
3328
3329
//
//  charClasses
//      Return the vector of character class sets (fSets) that the constructor fills in.
//      The pointer refers to this object's own vector, which the destructor deletes.
//
3330UVector *RBBILineMonkey::charClasses() {
3331 return fSets;
3332}
3333
3334
3335RBBILineMonkey::~RBBILineMonkey() {
3336 delete fSets;
3337
3338 delete fBK;
3339 delete fCR;
3340 delete fLF;
3341 delete fCM;
3342 delete fNL;
3343 delete fWJ;
3344 delete fZW;
3345 delete fGL;
3346 delete fCB;
3347 delete fSP;
3348 delete fB2;
3349 delete fBA;
3350 delete fBB;
3351 delete fHY;
3352 delete fH2;
3353 delete fH3;
3354 delete fCL;
729e4ab9 3355 delete fCP;
73c04bcf
A
3356 delete fEX;
3357 delete fIN;
3358 delete fJL;
3359 delete fJV;
3360 delete fJT;
3361 delete fNS;
3362 delete fOP;
3363 delete fQU;
3364 delete fIS;
3365 delete fNU;
3366 delete fPO;
3367 delete fPR;
3368 delete fSY;
3369 delete fAI;
3370 delete fAL;
4388f060
A
3371 delete fCJ;
3372 delete fHL;
73c04bcf 3373 delete fID;
51004dcb 3374 delete fRI;
73c04bcf
A
3375 delete fSA;
3376 delete fSG;
3377 delete fXX;
3378
3379 delete fCharBI;
3380 delete fNumberMatcher;
3381}
3382
3383
3384//-------------------------------------------------------------------------------------------
3385//
3386// TestMonkey
3387//
3388// params
3389// seed=nnnnn Random number starting seed.
3390// Setting the seed allows errors to be reproduced.
3391// loop=nnn Looping count. Controls running time.
3392// -1: run forever.
3393// 0 or greater: run length.
3394//
3395// type = char | word | line | sent | title
3396//
3397//-------------------------------------------------------------------------------------------
3398
3399static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3400 int32_t val = defaultVal;
3401 name.append(" *= *(-?\\d+)");
3402 UErrorCode status = U_ZERO_ERROR;
3403 RegexMatcher m(name, params, 0, status);
3404 if (m.find()) {
3405 // The param exists. Convert the string to an int.
3406 char valString[100];
3407 int32_t paramLength = m.end(1, status) - m.start(1, status);
3408 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3409 paramLength = (int32_t)(sizeof(valString)-2);
3410 }
3411 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3412 val = strtol(valString, NULL, 10);
3413
3414 // Delete this parameter from the params string.
3415 m.reset();
3416 params = m.replaceFirst("", status);
3417 }
3418 U_ASSERT(U_SUCCESS(status));
3419 return val;
3420}
3421#endif
3422
51004dcb 3423#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
3424static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3425 BreakIterator *bi,
3426 int expected[],
3427 int expectedcount)
3428{
3429 int count = 0;
3430 int i = 0;
3431 int forward[50];
3432 bi->setText(ustr);
3433 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3434 forward[count] = i;
3435 if (count < expectedcount && expected[count] != i) {
3436 test->errln("break forward test failed: expected %d but got %d",
3437 expected[count], i);
3438 break;
3439 }
3440 count ++;
3441 }
3442 if (count != expectedcount) {
3443 printStringBreaks(ustr, expected, expectedcount);
3444 test->errln("break forward test failed: missed %d match",
3445 expectedcount - count);
3446 return;
3447 }
3448 // testing boundaries
3449 for (i = 1; i < expectedcount; i ++) {
3450 int j = expected[i - 1];
3451 if (!bi->isBoundary(j)) {
3452 printStringBreaks(ustr, expected, expectedcount);
3453 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3454 return;
3455 }
3456 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3457 if (bi->isBoundary(j)) {
3458 printStringBreaks(ustr, expected, expectedcount);
3459 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3460 return;
3461 }
3462 }
3463 }
3464
3465 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3466 count --;
3467 if (forward[count] != i) {
51004dcb 3468 printStringBreaks(ustr, expected, expectedcount);
73c04bcf
A
3469 test->errln("happy break test previous() failed: expected %d but got %d",
3470 forward[count], i);
3471 break;
3472 }
3473 }
3474 if (count != 0) {
3475 printStringBreaks(ustr, expected, expectedcount);
3476 test->errln("break test previous() failed: missed a match");
3477 return;
3478 }
3479
3480 // testing preceding
3481 for (i = 0; i < expectedcount - 1; i ++) {
3482 // int j = expected[i] + 1;
3483 int j = ustr.moveIndex32(expected[i], 1);
3484 for (; j <= expected[i + 1]; j ++) {
3485 if (bi->preceding(j) != expected[i]) {
3486 printStringBreaks(ustr, expected, expectedcount);
3487 test->errln("preceding(): Not expecting boundary at position %d", j);
3488 return;
3489 }
3490 }
3491 }
3492}
51004dcb 3493#endif
73c04bcf
A
3494
3495void RBBITest::TestWordBreaks(void)
3496{
3497#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3498
73c04bcf
A
3499 Locale locale("en");
3500 UErrorCode status = U_ZERO_ERROR;
3501 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3502 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
51004dcb
A
3503 // Replaced any C+J characters in a row with a random sequence of characters
3504 // of the same length to make our C+J segmentation not get in the way.
73c04bcf
A
3505 static const char *strlist[] =
3506 {
3507 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
51004dcb 3508 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
46f4442e 3509 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
73c04bcf 3510 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
51004dcb 3511 "\\uac00\\u3588\\u009c\\u0953\\u194b",
73c04bcf
A
3512 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3513 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
51004dcb 3514 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
73c04bcf
A
3515 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3516 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3517 "\\u2027\\U000e0067\\u0a47\\u00b7",
3518 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3519 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3520 "\\u0589\\U000e006e\\u0a42\\U000104a5",
51004dcb 3521 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
73c04bcf
A
3522 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3523 "\\u0027\\u11af\\U000e0057\\u0602",
3524 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3525 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3526 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3527 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
46f4442e 3528 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
73c04bcf
A
3529 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3530 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3531 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3532 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
51004dcb 3533 "\\u18f4\\U000e0049\\u20e7\\u2027",
73c04bcf
A
3534 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3535 "\\ua183\\u102d\\u0bec\\u003a",
3536 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3537 "\\u003a\\u0e57\\u0fad\\u002e",
3538 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3539 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3540 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3541 "\\u003a\\u0664\\u00b7\\u1fba",
3542 "\\u003b\\u0027\\u00b7\\u47a3",
51004dcb 3543 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
73c04bcf
A
3544 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3545 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3546 };
3547 int loop;
3548 if (U_FAILURE(status)) {
729e4ab9 3549 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3550 return;
3551 }
3552 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3553 // printf("looping %d\n", loop);
46f4442e 3554 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
73c04bcf
A
3555 // RBBICharMonkey monkey;
3556 RBBIWordMonkey monkey;
3557
3558 int expected[50];
3559 int expectedcount = 0;
3560
3561 monkey.setText(ustr);
3562 int i;
3563 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3564 expected[expectedcount ++] = i;
3565 }
3566
3567 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3568 }
3569 delete bi;
3570#endif
3571}
3572
3573void RBBITest::TestWordBoundary(void)
3574{
3575 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3576 Locale locale("en");
3577 UErrorCode status = U_ZERO_ERROR;
3578 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3579 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3580 UChar str[50];
3581 static const char *strlist[] =
3582 {
3583 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3584 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3585 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3586 "\\u2027\\U000e0067\\u0a47\\u00b7",
3587 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3588 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3589 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3590 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3591 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3592 "\\u0027\\u11af\\U000e0057\\u0602",
3593 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3594 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3595 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3596 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3597 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
51004dcb 3598 "\\U000e0065\\u302c\\u09ee\\U000e0068",
73c04bcf
A
3599 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3600 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3601 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3602 "\\u58f4\\U000e0049\\u20e7\\u2027",
51004dcb 3603 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
73c04bcf
A
3604 "\\ua183\\u102d\\u0bec\\u003a",
3605 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3606 "\\u003a\\u0e57\\u0fad\\u002e",
3607 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3608 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3609 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3610 "\\u003a\\u0664\\u00b7\\u1fba",
3611 "\\u003b\\u0027\\u00b7\\u47a3",
3612 };
3613 int loop;
3614 if (U_FAILURE(status)) {
729e4ab9 3615 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3616 return;
3617 }
3618 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3619 // printf("looping %d\n", loop);
3620 u_unescape(strlist[loop], str, 20);
3621 UnicodeString ustr(str);
3622 int forward[50];
3623 int count = 0;
3624
3625 bi->setText(ustr);
3626 int prev = 0;
3627 int i;
3628 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3629 forward[count ++] = i;
3630 if (i > prev) {
3631 int j;
3632 for (j = prev + 1; j < i; j ++) {
3633 if (bi->isBoundary(j)) {
3634 printStringBreaks(ustr, forward, count);
3635 errln("happy boundary test failed: expected %d not a boundary",
3636 j);
3637 return;
3638 }
3639 }
3640 }
3641 if (!bi->isBoundary(i)) {
3642 printStringBreaks(ustr, forward, count);
3643 errln("happy boundary test failed: expected %d a boundary",
3644 i);
3645 return;
3646 }
3647 prev = i;
3648 }
3649 }
3650 delete bi;
3651}
3652
3653void RBBITest::TestLineBreaks(void)
3654{
3655#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3656 Locale locale("en");
3657 UErrorCode status = U_ZERO_ERROR;
3658 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3659 const int32_t STRSIZE = 50;
3660 UChar str[STRSIZE];
3661 static const char *strlist[] =
3662 {
3663 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3664 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3665 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3666 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3667 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3668 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3669 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3670 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3671 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3672 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3673 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3674 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3675 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3676 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3677 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3678 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3679 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3680 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3681 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3682 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3683 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3684 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3685 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3686 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3687 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3688 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3689 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3690 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3691 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3692 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3693 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3694 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3695 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3696 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3697 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3698 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3699 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3700 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3701 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3702 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3703 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3704 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3705 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3706 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3707 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3708 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3709 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3710 };
3711 int loop;
3712 TEST_ASSERT_SUCCESS(status);
3713 if (U_FAILURE(status)) {
3714 return;
3715 }
3716 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3717 // printf("looping %d\n", loop);
3718 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3719 if (t >= STRSIZE) {
3720 TEST_ASSERT(FALSE);
3721 continue;
3722 }
3723
46f4442e 3724
73c04bcf
A
3725 UnicodeString ustr(str);
3726 RBBILineMonkey monkey;
3727 if (U_FAILURE(monkey.deferredStatus)) {
3728 continue;
3729 }
3730
3731 const int EXPECTEDSIZE = 50;
3732 int expected[EXPECTEDSIZE];
3733 int expectedcount = 0;
3734
3735 monkey.setText(ustr);
3736 int i;
3737 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3738 if (expectedcount >= EXPECTEDSIZE) {
3739 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3740 return;
3741 }
3742 expected[expectedcount ++] = i;
3743 }
3744
3745 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3746 }
3747 delete bi;
3748#endif
3749}
3750
3751void RBBITest::TestSentBreaks(void)
3752{
3753#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3754 Locale locale("en");
3755 UErrorCode status = U_ZERO_ERROR;
3756 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3757 UChar str[200];
3758 static const char *strlist[] =
3759 {
3760 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3761 "This\n",
3762 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3763 "\"Sentence ending with a quote.\" Bye.",
3764 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3765 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3766 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3767 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3768 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3769 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3770 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3771 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3772 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3773 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3774 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3775 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3776 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3777 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3778 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3779 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3780 };
3781 int loop;
3782 if (U_FAILURE(status)) {
729e4ab9 3783 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3784 return;
3785 }
3786 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3787 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3788 UnicodeString ustr(str);
3789
3790 RBBISentMonkey monkey;
3791 if (U_FAILURE(monkey.deferredStatus)) {
3792 continue;
3793 }
3794
3795 const int EXPECTEDSIZE = 50;
3796 int expected[EXPECTEDSIZE];
3797 int expectedcount = 0;
3798
3799 monkey.setText(ustr);
3800 int i;
3801 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3802 if (expectedcount >= EXPECTEDSIZE) {
3803 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3804 return;
3805 }
3806 expected[expectedcount ++] = i;
3807 }
3808
3809 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3810 }
3811 delete bi;
3812#endif
3813}
3814
3815void RBBITest::TestMonkey(char *params) {
3816#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3817
3818 UErrorCode status = U_ZERO_ERROR;
3819 int32_t loopCount = 500;
3820 int32_t seed = 1;
3821 UnicodeString breakType = "all";
3822 Locale locale("en");
3823 UBool useUText = FALSE;
3824
3825 if (quick == FALSE) {
3826 loopCount = 10000;
3827 }
3828
3829 if (params) {
3830 UnicodeString p(params);
3831 loopCount = getIntParam("loop", p, loopCount);
3832 seed = getIntParam("seed", p, seed);
3833
3834 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3835 if (m.find()) {
3836 breakType = m.group(1, status);
3837 m.reset();
3838 p = m.replaceFirst("", status);
3839 }
3840
3841 RegexMatcher u(" *utext", p, 0, status);
3842 if (u.find()) {
3843 useUText = TRUE;
3844 u.reset();
3845 p = u.replaceFirst("", status);
3846 }
3847
3848
3849 // m.reset(p);
46f4442e 3850 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
73c04bcf
A
3851 // Each option is stripped out of the option string as it is processed.
3852 // All options have been checked. The option string should have been completely emptied..
3853 char buf[100];
3854 p.extract(buf, sizeof(buf), NULL, status);
3855 buf[sizeof(buf)-1] = 0;
3856 errln("Unrecognized or extra parameter: %s\n", buf);
3857 return;
3858 }
3859
3860 }
3861
3862 if (breakType == "char" || breakType == "all") {
3863 RBBICharMonkey m;
3864 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3865 if (U_SUCCESS(status)) {
3866 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3867 if (breakType == "all" && useUText==FALSE) {
3868 // Also run a quick test with UText when "all" is specified
3869 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3870 }
3871 }
3872 else {
729e4ab9 3873 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
73c04bcf
A
3874 }
3875 delete bi;
3876 }
3877
3878 if (breakType == "word" || breakType == "all") {
3879 logln("Word Break Monkey Test");
3880 RBBIWordMonkey m;
3881 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3882 if (U_SUCCESS(status)) {
3883 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3884 }
3885 else {
729e4ab9 3886 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
73c04bcf
A
3887 }
3888 delete bi;
3889 }
3890
3891 if (breakType == "line" || breakType == "all") {
3892 logln("Line Break Monkey Test");
3893 RBBILineMonkey m;
3894 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3895 if (loopCount >= 10) {
3896 loopCount = loopCount / 5; // Line break runs slower than the others.
3897 }
3898 if (U_SUCCESS(status)) {
3899 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3900 }
3901 else {
729e4ab9 3902 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3903 }
3904 delete bi;
3905 }
3906
46f4442e 3907 if (breakType == "sent" || breakType == "all" ) {
73c04bcf
A
3908 logln("Sentence Break Monkey Test");
3909 RBBISentMonkey m;
3910 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3911 if (loopCount >= 10) {
3912 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3913 }
3914 if (U_SUCCESS(status)) {
3915 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3916 }
3917 else {
729e4ab9 3918 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3919 }
3920 delete bi;
3921 }
3922
3923#endif
3924}
3925
3926//
3927// Run a RBBI monkey test. Common routine, for all break iterator types.
3928// Parameters:
3929// bi - the break iterator to use
3930// mk - MonkeyKind, abstraction for obtaining expected results
3931// name - Name of test (char, word, etc.) for use in error messages
3932// seed - Seed for starting random number generator (parameter from user)
3933// numIterations
3934//
46f4442e 3935void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
73c04bcf
A
3936 int32_t numIterations, UBool useUText) {
3937
3938#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3939
3940 const int32_t TESTSTRINGLEN = 500;
3941 UnicodeString testText;
3942 int32_t numCharClasses;
3943 UVector *chClasses;
3944 int expected[TESTSTRINGLEN*2 + 1];
3945 int expectedCount = 0;
3946 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3947 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3948 char reverseBreaks[TESTSTRINGLEN*2+1];
3949 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3950 char followingBreaks[TESTSTRINGLEN*2+1];
3951 char precedingBreaks[TESTSTRINGLEN*2+1];
3952 int i;
3953 int loopCount = 0;
3954
3955 m_seed = seed;
3956
3957 numCharClasses = mk.charClasses()->size();
3958 chClasses = mk.charClasses();
3959
3960 // Check for errors that occured during the construction of the MonkeyKind object.
3961 // Can't report them where they occured because errln() is a method coming from intlTest,
3962 // and is not visible outside of RBBITest :-(
3963 if (U_FAILURE(mk.deferredStatus)) {
3964 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3965 return;
3966 }
3967
3968 // Verify that the character classes all have at least one member.
3969 for (i=0; i<numCharClasses; i++) {
3970 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3971 if (s == NULL || s->size() == 0) {
3972 errln("Character Class #%d is null or of zero size.", i);
3973 return;
3974 }
3975 }
3976
3977 while (loopCount < numIterations || numIterations == -1) {
3978 if (numIterations == -1 && loopCount % 10 == 0) {
3979 // If test is running in an infinite loop, display a periodic tic so
3980 // we can tell that it is making progress.
3981 fprintf(stderr, ".");
3982 }
3983 // Save current random number seed, so that we can recreate the random numbers
3984 // for this loop iteration in event of an error.
3985 seed = m_seed;
3986
3987 // Populate a test string with data.
3988 testText.truncate(0);
3989 for (i=0; i<TESTSTRINGLEN; i++) {
3990 int32_t aClassNum = m_rand() % numCharClasses;
3991 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3992 int32_t charIdx = m_rand() % classSet->size();
3993 UChar32 c = classSet->charAt(charIdx);
3994 if (c < 0) { // TODO: deal with sets containing strings.
3995 errln("c < 0");
3996 break;
3997 }
3998 testText.append(c);
3999 }
4000
4001 // Calculate the expected results for this test string.
4002 mk.setText(testText);
4003 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4004 expectedBreaks[0] = 1;
4005 int32_t breakPos = 0;
4006 expectedCount = 0;
4007 for (;;) {
4008 breakPos = mk.next(breakPos);
4009 if (breakPos == -1) {
4010 break;
4011 }
4012 if (breakPos > testText.length()) {
4013 errln("breakPos > testText.length()");
4014 }
4015 expectedBreaks[breakPos] = 1;
4016 U_ASSERT(expectedCount<testText.length());
4017 expected[expectedCount ++] = breakPos;
57a6839d
A
4018 (void)expected; // Set but not used warning.
4019 // TODO (andy): check it out.
73c04bcf
A
4020 }
4021
4022 // Find the break positions using forward iteration
4023 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4024 if (useUText) {
4025 UErrorCode status = U_ZERO_ERROR;
4026 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4027 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4028 bi->setText(testUText, status);
4029 TEST_ASSERT_SUCCESS(status);
4030 utext_close(testUText); // The break iterator does a shallow clone of the UText
4031 // This UText can be closed immediately, so long as the
4032 // testText string continues to exist.
4033 } else {
4034 bi->setText(testText);
4035 }
4036
4037 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4038 if (i < 0 || i > testText.length()) {
4039 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4040 break;
4041 }
4042 forwardBreaks[i] = 1;
4043 }
4044
4045 // Find the break positions using reverse iteration
4046 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4047 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4048 if (i < 0 || i > testText.length()) {
4049 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4050 break;
4051 }
4052 reverseBreaks[i] = 1;
4053 }
4054
4055 // Find the break positions using isBoundary() tests.
4056 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4057 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4058 for (i=0; i<=testText.length(); i++) {
4059 isBoundaryBreaks[i] = bi->isBoundary(i);
4060 }
4061
4062
4063 // Find the break positions using the following() function.
4064 // printf(".");
4065 memset(followingBreaks, 0, sizeof(followingBreaks));
4066 int32_t lastBreakPos = 0;
4067 followingBreaks[0] = 1;
4068 for (i=0; i<testText.length(); i++) {
4069 breakPos = bi->following(i);
4070 if (breakPos <= i ||
4071 breakPos < lastBreakPos ||
4072 breakPos > testText.length() ||
729e4ab9 4073 (breakPos > lastBreakPos && lastBreakPos > i)) {
51004dcb
A
4074 UChar32 brkChar = testText.char32At(lastBreakPos);
4075 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
73c04bcf
A
4076 errln("%s break monkey test: "
4077 "Out of range value returned by BreakIterator::following().\n"
4078 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4079 name, seed, i, breakPos, lastBreakPos);
51004dcb 4080 }
73c04bcf
A
4081 break;
4082 }
4083 followingBreaks[breakPos] = 1;
4084 lastBreakPos = breakPos;
4085 }
4086
4087 // Find the break positions using the preceding() function.
46f4442e 4088 memset(precedingBreaks, 0, sizeof(precedingBreaks));
73c04bcf
A
4089 lastBreakPos = testText.length();
4090 precedingBreaks[testText.length()] = 1;
4091 for (i=testText.length(); i>0; i--) {
4092 breakPos = bi->preceding(i);
4093 if (breakPos >= i ||
4094 breakPos > lastBreakPos ||
729e4ab9
A
4095 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4096 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
51004dcb
A
4097 UChar32 brkChar = testText.char32At(breakPos);
4098 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
73c04bcf
A
4099 errln("%s break monkey test: "
4100 "Out of range value returned by BreakIterator::preceding().\n"
4101 "index=%d; prev returned %d; lastBreak=%d" ,
4102 name, i, breakPos, lastBreakPos);
46f4442e
A
4103 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4104 precedingBreaks[i] = 2; // Forces an error.
4105 }
51004dcb 4106 }
73c04bcf 4107 } else {
46f4442e
A
4108 if (breakPos >= 0) {
4109 precedingBreaks[breakPos] = 1;
4110 }
73c04bcf
A
4111 lastBreakPos = breakPos;
4112 }
4113 }
4114
4115 // Compare the expected and actual results.
4116 for (i=0; i<=testText.length(); i++) {
4117 const char *errorType = NULL;
4118 if (forwardBreaks[i] != expectedBreaks[i]) {
4119 errorType = "next()";
4120 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4121 errorType = "previous()";
4122 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4123 errorType = "isBoundary()";
4124 } else if (followingBreaks[i] != expectedBreaks[i]) {
4125 errorType = "following()";
4126 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4127 errorType = "preceding()";
4128 }
4129
4130
4131 if (errorType != NULL) {
4132 // Format a range of the test text that includes the failure as
4133 // a data item that can be included in the rbbi test data file.
4134
4135 // Start of the range is the last point where expected and actual results
4136 // both agreed that there was a break position.
4137 int startContext = i;
4138 int32_t count = 0;
4139 for (;;) {
4140 if (startContext==0) { break; }
4141 startContext --;
4142 if (expectedBreaks[startContext] != 0) {
4143 if (count == 2) break;
4144 count ++;
4145 }
4146 }
4147
4148 // End of range is two expected breaks past the start position.
4149 int endContext = i + 1;
4150 int ci;
4151 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4152 for (;;) {
4153 if (endContext >= testText.length()) {break;}
4154 if (expectedBreaks[endContext-1] != 0) {
4155 if (count == 0) break;
4156 count --;
4157 }
4158 endContext ++;
4159 }
4160 }
4161
4162 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4163 UnicodeString errorText = "<data>";
4164 /***if (strcmp(errorType, "next()") == 0) {
4165 startContext = 0;
4166 endContext = testText.length();
4167
4168 printStringBreaks(testText, expected, expectedCount);
4169 }***/
4170
4171 for (ci=startContext; ci<endContext;) {
4172 UnicodeString hexChars("0123456789abcdef");
4173 UChar32 c;
4174 int bn;
4175 c = testText.char32At(ci);
4176 if (ci == i) {
4177 // This is the location of the error.
4178 errorText.append("<?>");
4179 } else if (expectedBreaks[ci] != 0) {
4180 // This a non-error expected break position.
4181 errorText.append("\\");
4182 }
4183 if (c < 0x10000) {
4184 errorText.append("\\u");
4185 for (bn=12; bn>=0; bn-=4) {
4186 errorText.append(hexChars.charAt((c>>bn)&0xf));
4187 }
4188 } else {
4189 errorText.append("\\U");
4190 for (bn=28; bn>=0; bn-=4) {
4191 errorText.append(hexChars.charAt((c>>bn)&0xf));
4192 }
4193 }
4194 ci = testText.moveIndex32(ci, 1);
4195 }
4196 errorText.append("\\");
4197 errorText.append("</data>\n");
4198
4199 // Output the error
4200 char charErrorTxt[500];
4201 UErrorCode status = U_ZERO_ERROR;
4202 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4203 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4388f060
A
4204 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4205
51004dcb
A
4206 UChar32 brkChar = testText.char32At(i);
4207 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4388f060
A
4208 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4209 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
73c04bcf 4210 errorType, seed, i, charErrorTxt);
51004dcb 4211 }
73c04bcf
A
4212 break;
4213 }
4214 }
4215
4216 loopCount++;
4217 }
4218#endif
4219}
4220
729e4ab9
A
4221
4222// Bug 5532. UTF-8 based UText fails in dictionary code.
4223// This test checks the initial patch,
4224// which is to just keep it from crashing. Correct word boundaries
4225// await a proper fix to the dictionary code.
4226//
4227void RBBITest::TestBug5532(void) {
4228 // Text includes a mixture of Thai and Latin.
4229 const unsigned char utf8Data[] = {
4230 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4231 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4232 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4233 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4234 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4235 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4236 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4237 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4238 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4239 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4240 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4241
4242 UErrorCode status = U_ZERO_ERROR;
4243 UText utext=UTEXT_INITIALIZER;
4244 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4245 TEST_ASSERT_SUCCESS(status);
4246
4247 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4248 TEST_ASSERT_SUCCESS(status);
4249 if (U_SUCCESS(status)) {
4250 bi->setText(&utext, status);
4251 TEST_ASSERT_SUCCESS(status);
4252
4253 int32_t breakCount = 0;
4254 int32_t previousBreak = -1;
4255 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4256 // For now, just make sure that the break iterator doesn't hang.
4257 TEST_ASSERT(previousBreak < bi->current());
4258 previousBreak = bi->current();
4259 }
4260 TEST_ASSERT(breakCount > 0);
4261 }
4262 delete bi;
4263 utext_close(&utext);
4264}
4265
4266
51004dcb
A
4267void RBBITest::TestBug9983(void) {
4268 UnicodeString text = UnicodeString("\\u002A" // * Other
4269 "\\uFF65" // Other
4270 "\\u309C" // Katakana
4271 "\\uFF9F" // Extend
4272 "\\uFF65" // Other
4273 "\\u0020" // Other
4274 "\\u0000").unescape();
4275
4276 UErrorCode status = U_ZERO_ERROR;
4277 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4278 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4279 TEST_ASSERT_SUCCESS(status);
57a6839d
A
4280 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4281 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4282 TEST_ASSERT_SUCCESS(status);
51004dcb
A
4283 if (U_FAILURE(status)) {
4284 return;
4285 }
57a6839d
A
4286 int32_t offset, rstatus, iterationCount;
4287
51004dcb 4288 brkiter->setText(text);
51004dcb 4289 brkiter->last();
57a6839d 4290 iterationCount = 0;
51004dcb
A
4291 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4292 iterationCount++;
4293 rstatus = brkiter->getRuleStatus();
57a6839d
A
4294 (void)rstatus; // Suppress set but not used warning.
4295 if (iterationCount >= 10) {
4296 break;
4297 }
4298 }
4299 TEST_ASSERT(iterationCount == 6);
4300
4301 brkiterPOSIX->setText(text);
4302 brkiterPOSIX->last();
4303 iterationCount = 0;
4304 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4305 iterationCount++;
4306 rstatus = brkiterPOSIX->getRuleStatus();
4307 (void)rstatus; // Suppress set but not used warning.
51004dcb
A
4308 if (iterationCount >= 10) {
4309 break;
4310 }
4311 }
4312 TEST_ASSERT(iterationCount == 6);
4313}
4314
4315
73c04bcf
A
4316//
4317// TestDebug - A place-holder test for debugging purposes.
4318// For putting in fragments of other tests that can be invoked
4319// for tracing without a lot of unwanted extra stuff happening.
4320//
4321void RBBITest::TestDebug(void) {
4322#if 0
4323 UErrorCode status = U_ZERO_ERROR;
4324 int pos = 0;
4325 int ruleStatus = 0;
4326
4327 RuleBasedBreakIterator* bi =
4328 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4329 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4330 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4331 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4332 // UnicodeString s("Aaa. Bcd");
4333 s = s.unescape();
4334 bi->setText(s);
4335 UBool r = bi->isBoundary(8);
4336 printf("%s", r?"true":"false");
4337 return;
4338 pos = bi->last();
4339 do {
4340 // ruleStatus = bi->getRuleStatus();
4341 printf("%d\t%d\n", pos, ruleStatus);
4342 pos = bi->previous();
4343 } while (pos != BreakIterator::DONE);
4344#endif
4345}
4346
4388f060
A
4347void RBBITest::TestProperties() {
4348 UErrorCode errorCode = U_ZERO_ERROR;
4349 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4350 if (!prependSet.isEmpty()) {
4351 errln(
4352 "[:GCB=Prepend:] is not empty any more. "
4353 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4354 "change this test to the opposite condition.");
4355 }
4356}
4357
73c04bcf 4358#endif /* #if !UCONFIG_NO_BREAK_ITERATION */