]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
73c04bcf
A
1/********************************************************************
2 * COPYRIGHT:
46f4442e 3 * Copyright (c) 1999-2008, International Business Machines Corporation and
73c04bcf
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7* Date Name Description
8* 12/15/99 Madhu Creation.
9* 01/12/2000 Madhu Updated for changed API and added new tests
10************************************************************************/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/utypes.h"
17#include "unicode/brkiter.h"
18#include "unicode/rbbi.h"
19#include "unicode/uchar.h"
20#include "unicode/utf16.h"
21#include "unicode/ucnv.h"
22#include "unicode/schriter.h"
23#include "unicode/uniset.h"
24#include "unicode/regex.h" // TODO: make conditional on regexp being built.
25#include "unicode/ustring.h"
26#include "unicode/utext.h"
27#include "intltest.h"
28#include "rbbitst.h"
29#include <string.h>
30#include "uvector.h"
31#include "uvectr32.h"
32#include "triedict.h"
33#include <string.h>
34#include <stdio.h>
35#include <stdlib.h>
36
// TEST_ASSERT(x)
//    Report a test failure, tagged with the current source file and line,
//    when the condition x evaluates to false.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode)
//    Report a test failure, tagged with the current source file and line and
//    the symbolic name of the error, when errcode is an ICU failure code.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
42
43
46f4442e
A
44//---------------------------------------------
45// runIndexedTest
46//---------------------------------------------
47
48void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
49{
50 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
51
52 switch (index) {
53 case 0: name = "TestBug4153072";
54 if(exec) TestBug4153072(); break;
55 case 1: name = "TestJapaneseLineBreak";
56 if(exec) TestJapaneseLineBreak(); break;
57 case 2: name = "TestStatusReturn";
58 if(exec) TestStatusReturn(); break;
59 case 3: name = "TestUnicodeFiles";
60 if(exec) TestUnicodeFiles(); break;
61 case 4: name = "TestEmptyString";
62 if(exec) TestEmptyString(); break;
63
64 case 5: name = "TestGetAvailableLocales";
65 if(exec) TestGetAvailableLocales(); break;
66
67 case 6: name = "TestGetDisplayName";
68 if(exec) TestGetDisplayName(); break;
69
70 case 7: name = "TestEndBehaviour";
71 if(exec) TestEndBehaviour(); break;
72 case 8: name = "TestMixedThaiLineBreak";
73 if(exec) TestMixedThaiLineBreak(); break;
74 case 9: name = "TestThaiLineBreak";
75 if(exec) TestThaiLineBreak(); break;
76 case 10: name = "TestMaiyamok";
77 if(exec) TestMaiyamok(); break;
78 case 11: name = "TestWordBreaks";
79 if(exec) TestWordBreaks(); break;
80 case 12: name = "TestWordBoundary";
81 if(exec) TestWordBoundary(); break;
82 case 13: name = "TestLineBreaks";
83 if(exec) TestLineBreaks(); break;
84 case 14: name = "TestSentBreaks";
85 if(exec) TestSentBreaks(); break;
86 case 15: name = "TestExtended";
87 if(exec) TestExtended(); break;
88 case 16: name = "TestMonkey";
89 if(exec) {
90 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
91 TestMonkey(params);
92 #else
93 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
94 #endif
95 }
96 break;
97 case 17: name = "TestBug3818";
98 if(exec) TestBug3818(); break;
99 case 18: name = "TestJapaneseWordBreak";
100 if(exec) TestJapaneseWordBreak(); break;
101 case 19: name = "TestDebug";
102 if(exec) TestDebug(); break;
103 case 20: name = "TestTrieDict";
104 if(exec) TestTrieDict(); break;
105 case 21: name = "TestBug5775";
106 if (exec) TestBug5775(); break;
107 case 22: name = "TestThaiBreaks";
108 if (exec) TestThaiBreaks(); break;
109
110 default: name = ""; break; //needed to end loop
111 }
112}
113
114
73c04bcf
A
115//---------------------------------------------------------------------------
116//
117// class BITestData Holds a set of Break iterator test data and results
118// Includes
119// - the string data to be broken
120// - a vector of the expected break positions.
121// - a vector of source line numbers for the data,
//                                (to help see where errors occurred.)
123// - The expected break tag values.
124// - Vectors of actual break positions and tag values.
125// - Functions for comparing actual with expected and
126// reporting errors.
127//
128//----------------------------------------------------------------------------
129class BITestData {
130public:
131 UnicodeString fDataToBreak;
132 UVector fExpectedBreakPositions;
133 UVector fExpectedTags;
134 UVector fLineNum;
135 UVector fActualBreakPositions; // Test Results.
136 UVector fActualTags;
137
138 BITestData(UErrorCode &status);
139 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
140 void checkResults(const char *heading, RBBITest *test);
141 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
142 void clearResults();
143};
144
145//
146// Constructor.
147//
148BITestData::BITestData(UErrorCode &status)
149: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
150 fActualTags(status)
151{
152}
153
154//
155// addDataChunk. Add a section (non-breaking) piece if data to the test data.
156// The macro form collects the line number, which is helpful
157// when tracking down failures.
158//
159// A null data item is inserted at the start of each test's data
160// to put the starting zero into the data list. The position saved for
161// each non-null item is its ending position.
162//
163#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
164void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
165 if (U_FAILURE(status)) {return;}
166 if (data != NULL) {
167 fDataToBreak.append(CharsToUnicodeString(data));
168 }
169 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
170 fExpectedTags.addElement(tag, status);
171 fLineNum.addElement(lineNum, status);
172}
173
174
175//
176// checkResults. Compare the actual and expected break positions, report any differences.
177//
178void BITestData::checkResults(const char *heading, RBBITest *test) {
179 int32_t expectedIndex = 0;
180 int32_t actualIndex = 0;
181
182 for (;;) {
183 // If we've run through both the expected and actual results vectors, we're done.
184 // break out of the loop.
185 if (expectedIndex >= fExpectedBreakPositions.size() &&
186 actualIndex >= fActualBreakPositions.size()) {
187 break;
188 }
189
190
191 if (expectedIndex >= fExpectedBreakPositions.size()) {
192 err(heading, test, expectedIndex-1, actualIndex);
193 actualIndex++;
194 continue;
195 }
196
197 if (actualIndex >= fActualBreakPositions.size()) {
198 err(heading, test, expectedIndex, actualIndex-1);
199 expectedIndex++;
200 continue;
201 }
202
203 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
204 err(heading, test, expectedIndex, actualIndex);
205 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
206 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
207 actualIndex++;
208 } else {
209 expectedIndex++;
210 }
211 continue;
212 }
213
214 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
215 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
216 heading, fLineNum.elementAt(expectedIndex),
217 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
218 }
219
220 actualIndex++;
221 expectedIndex++;
222 }
223}
224
225//
226// err - An error was found. Report it, along with information about where the
227// incorrectly broken test data appeared in the source file.
228//
229void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
230{
231 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
232 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
233 int32_t o = 0;
234 int32_t line = fLineNum.elementAti(expectedIdx);
235 if (expectedIdx > 0) {
236 // The line numbers are off by one because a premature break occurs somewhere
237 // within the previous item, rather than at the start of the current (expected) item.
238 // We want to report the offset of the unexpected break from the start of
239 // this previous item.
240 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
241 }
242 if (actual < expected) {
46f4442e 243 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
73c04bcf 244 } else {
46f4442e 245 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
73c04bcf
A
246 }
247}
248
249
250void BITestData::clearResults() {
251 fActualBreakPositions.removeAllElements();
252 fActualTags.removeAllElements();
253}
254
255
256//-----------------------------------------------------------------------------------
257//
//    Canned Test Characters
259//
260//-----------------------------------------------------------------------------------
261
262static const UChar cannedTestArray[] = {
263 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
264 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
265 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
266 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
267 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
268 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
269 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
270 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
271};
272
273static UnicodeString* cannedTestChars = 0;
274
275#define halfNA "\\u0928\\u094d\\u200d"
276#define halfSA "\\u0938\\u094d\\u200d"
277#define halfCHA "\\u091a\\u094d\\u200d"
278#define halfKA "\\u0915\\u094d\\u200d"
279#define deadTA "\\u0924\\u094d"
280
281//--------------------------------------------------------------------------------------
282//
283// RBBITest constructor and destructor
284//
285//--------------------------------------------------------------------------------------
286
287RBBITest::RBBITest() {
288 UnicodeString temp(cannedTestArray);
289 cannedTestChars = new UnicodeString();
290 *cannedTestChars += (UChar)0x0000;
291 *cannedTestChars += temp;
292}
293
294
295RBBITest::~RBBITest() {
296 delete cannedTestChars;
297}
298
299
300static const int T_NUMBER = 100;
301static const int T_LETTER = 200;
302static const int T_H_OR_K = 300;
303static const int T_IDEO = 400;
304
305
306
307
308
309
310//--------------------------------------------------------------------
311//Testing the BreakIterator for devanagari script
312//--------------------------------------------------------------------
313
314#define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/
315#define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/
316#define deadTTHA "\\u0920\\u094d"
317#define deadPA "\\u092a\\u094d"
318#define deadSA "\\u0938\\u094d"
319#define visarga "\\u0903" /*devanagari visarga looks like a english colon*/
320
321
322
323
324
325
326//-----------------------------------------------------------------------------------
327//
328// Test for status {tag} return value from break rules.
329// TODO: a more thorough test.
330//
331//-----------------------------------------------------------------------------------
332void RBBITest::TestStatusReturn() {
46f4442e 333 UnicodeString rulesString1("$Letters = [:L:];\n"
73c04bcf
A
334 "$Numbers = [:N:];\n"
335 "$Letters+{1};\n"
336 "$Numbers+{2};\n"
337 "Help\\ {4}/me\\!;\n"
338 "[^$Letters $Numbers];\n"
46f4442e 339 "!.*;\n", -1, US_INV);
73c04bcf
A
340 UnicodeString testString1 = "abc123..abc Help me Help me!";
341 // 01234567890123456789012345678
342 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
343 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
344
345 UErrorCode status=U_ZERO_ERROR;
346 UParseError parseError;
347
348 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
349 if(U_FAILURE(status)) {
350 errln("FAIL : in construction");
351 } else {
352 int32_t pos;
353 int32_t i = 0;
354 bi->setText(testString1);
355 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
356 if (pos != bounds1[i]) {
357 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
358 break;
359 }
360
361 int tag = bi->getRuleStatus();
362 if (tag != brkStatus[i]) {
363 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
364 break;
365 }
366 i++;
367 }
368 }
369 delete bi;
370}
371
372
373static void printStringBreaks(UnicodeString ustr, int expected[],
374 int expectedcount)
375{
376 UErrorCode status = U_ZERO_ERROR;
377 char name[100];
378 printf("code alpha extend alphanum type word sent line name\n");
379 int j;
380 for (j = 0; j < ustr.length(); j ++) {
381 if (expectedcount > 0) {
382 int k;
383 for (k = 0; k < expectedcount; k ++) {
384 if (j == expected[k]) {
385 printf("------------------------------------------------ %d\n",
386 j);
387 }
388 }
389 }
390 UChar32 c = ustr.char32At(j);
391 if (c > 0xffff) {
392 j ++;
393 }
394 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
395 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
396 u_isUAlphabetic(c),
397 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
398 u_isalnum(c),
399 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
400 u_charType(c),
401 U_SHORT_PROPERTY_NAME),
402 u_getPropertyValueName(UCHAR_WORD_BREAK,
403 u_getIntPropertyValue(c,
404 UCHAR_WORD_BREAK),
405 U_SHORT_PROPERTY_NAME),
406 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
407 u_getIntPropertyValue(c,
408 UCHAR_SENTENCE_BREAK),
409 U_SHORT_PROPERTY_NAME),
410 u_getPropertyValueName(UCHAR_LINE_BREAK,
411 u_getIntPropertyValue(c,
412 UCHAR_LINE_BREAK),
413 U_SHORT_PROPERTY_NAME),
414 name);
415 }
416}
417
418void RBBITest::TestThaiLineBreak() {
419 UErrorCode status = U_ZERO_ERROR;
420 BITestData thaiLineSelection(status);
421
422 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
423 // represents elided letters at the end of a long word. It should be bound to
424 // the end of the word and not treated as an independent punctuation mark.
425
426
427 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
428 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
429 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
430 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
431 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
432// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
433// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
434 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
435 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
436 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
437 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
438 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
439 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
440 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
441 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
442
443 // the one time where the paiyannoi occurs somewhere other than at the end
444 // of a word is in the Thai abbrevation for "etc.", which both begins and
445 // ends with a paiyannoi
446 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
447 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
448 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
449
450 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
451 Locale("th"), status);
452 if (U_FAILURE(status))
453 {
454 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
455 return;
456 }
457
458 generalIteratorTest(*e, thaiLineSelection);
459 delete e;
460}
461
462
463
464void RBBITest::TestMixedThaiLineBreak()
465{
466 UErrorCode status = U_ZERO_ERROR;
467 BITestData thaiLineSelection(status);
468
469 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
470
471
472 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
473 // start
474
475 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
476 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
477 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
478 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
479 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
480 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
481 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
482 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
483 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
484 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
485 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
486 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
487 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
488 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
489 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
490 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
491
492 // @suwit - end of changes
493
494
495 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
496 if (U_FAILURE(status))
497 {
498 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
499 return;
500 }
501
502
503 generalIteratorTest(*e, thaiLineSelection);
504 delete e;
505}
506
507
508void RBBITest::TestMaiyamok()
509{
510 UErrorCode status = U_ZERO_ERROR;
511 BITestData thaiLineSelection(status);
512 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
513 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
514 // word". Instead of appearing as a word unto itself, however, it's kept together
515 // with the word before it
516 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
517 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
518 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
519 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
520 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
521 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
522 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
523 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
524 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
525
526 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
527 Locale("th"), status);
528
529 if (U_FAILURE(status))
530 {
531 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
532 return;
533 }
534 generalIteratorTest(*e, thaiLineSelection);
535 delete e;
536}
537
538
539
540void RBBITest::TestBug3818() {
541 UErrorCode status = U_ZERO_ERROR;
542
543 // Four Thai words...
544 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
545 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
546 UnicodeString thaiStr(thaiWordData);
547
548 RuleBasedBreakIterator* bi =
549 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
550 if (U_FAILURE(status) || bi == NULL) {
551 errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
552 return;
553 }
554 bi->setText(thaiStr);
555
556 int32_t startOfSecondWord = bi->following(1);
557 if (startOfSecondWord != 4) {
558 errln("Fail at file %s, line %d expected start of word at 4, got %d",
559 __FILE__, __LINE__, startOfSecondWord);
560 }
561 startOfSecondWord = bi->following(0);
562 if (startOfSecondWord != 4) {
563 errln("Fail at file %s, line %d expected start of word at 4, got %d",
564 __FILE__, __LINE__, startOfSecondWord);
565 }
566 delete bi;
567}
568
569
570void RBBITest::TestJapaneseWordBreak() {
571 UErrorCode status = U_ZERO_ERROR;
572 BITestData japaneseWordSelection(status);
573
574 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
575 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
576 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
577 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
578 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
579 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
580 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
581
582 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
583 Locale("ja"), status);
584 if (U_FAILURE(status))
585 {
586 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
587 return;
588 }
589
590 generalIteratorTest(*e, japaneseWordSelection);
591 delete e;
592}
593
594void RBBITest::TestTrieDict() {
595 UErrorCode status = U_ZERO_ERROR;
596
597 //
598 // Open and read the test data file.
599 //
600 const char *testDataDirectory = IntlTest::getSourceTestData(status);
601 char testFileName[1000];
602 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
603 errln("Can't open test data. Path too long.");
604 return;
605 }
606 strcpy(testFileName, testDataDirectory);
607 strcat(testFileName, "riwords.txt");
608
609 // Items needing deleting at the end
610 MutableTrieDictionary *mutableDict = NULL;
611 CompactTrieDictionary *compactDict = NULL;
612 UnicodeSet *breaks = NULL;
613 UChar *testFile = NULL;
46f4442e
A
614 StringEnumeration *enumer1 = NULL;
615 StringEnumeration *enumer2 = NULL;
73c04bcf
A
616 MutableTrieDictionary *mutable2 = NULL;
617 StringEnumeration *cloneEnum = NULL;
618 CompactTrieDictionary *compact2 = NULL;
619
46f4442e 620
73c04bcf
A
621 const UnicodeString *originalWord = NULL;
622 const UnicodeString *cloneWord = NULL;
623 UChar *current;
624 UChar *word;
625 UChar uc;
626 int32_t wordLen;
627 int32_t wordCount;
628 int32_t testCount;
46f4442e 629
73c04bcf 630 int len;
46f4442e 631 testFile = ReadAndConvertFile(testFileName, len, NULL, status);
73c04bcf
A
632 if (U_FAILURE(status)) {
633 goto cleanup; /* something went wrong, error already output */
634 }
635
636 mutableDict = new MutableTrieDictionary(0x0E1C, status);
637 if (U_FAILURE(status)) {
638 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
639 goto cleanup;
640 }
46f4442e 641
73c04bcf
A
642 breaks = new UnicodeSet;
643 breaks->add(0x000A); // Line Feed
644 breaks->add(0x000D); // Carriage Return
645 breaks->add(0x2028); // Line Separator
646 breaks->add(0x2029); // Paragraph Separator
647
648 // Now add each non-comment line of the file as a word.
649 current = testFile;
650 word = current;
651 uc = *current++;
652 wordLen = 0;
653 wordCount = 0;
46f4442e 654
73c04bcf
A
655 while (uc) {
656 if (uc == 0x0023) { // #comment line, skip
657 while (uc && !breaks->contains(uc)) {
658 uc = *current++;
659 }
660 }
661 else while (uc && !breaks->contains(uc)) {
662 ++wordLen;
663 uc = *current++;
664 }
665 if (wordLen > 0) {
666 mutableDict->addWord(word, wordLen, status);
667 if (U_FAILURE(status)) {
668 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
669 goto cleanup;
670 }
671 wordCount += 1;
672 }
46f4442e 673
73c04bcf
A
674 // Find beginning of next line
675 while (uc && breaks->contains(uc)) {
676 uc = *current++;
677 }
678 word = current-1;
679 wordLen = 0;
680 }
46f4442e 681
73c04bcf
A
682 if (wordCount < 50) {
683 errln("Word count (%d) unreasonably small\n", wordCount);
684 goto cleanup;
685 }
686
46f4442e 687 enumer1 = mutableDict->openWords(status);
73c04bcf
A
688 if (U_FAILURE(status)) {
689 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
690 goto cleanup;
691 }
692
693 testCount = 0;
46f4442e 694 if (wordCount != (testCount = enumer1->count(status))) {
73c04bcf
A
695 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
696 testCount, wordCount, u_errorName(status));
697 goto cleanup;
698 }
46f4442e 699
73c04bcf
A
700 // Now compact it
701 compactDict = new CompactTrieDictionary(*mutableDict, status);
702 if (U_FAILURE(status)) {
703 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
704 goto cleanup;
705 }
46f4442e
A
706
707 enumer2 = compactDict->openWords(status);
73c04bcf
A
708 if (U_FAILURE(status)) {
709 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
710 goto cleanup;
711 }
46f4442e
A
712
713 if (wordCount != (testCount = enumer2->count(status))) {
73c04bcf
A
714 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
715 testCount, wordCount, u_errorName(status));
716 goto cleanup;
717 }
46f4442e
A
718
719 if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
720 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
721 }
722 delete enumer1;
723 enumer1 = NULL;
724 delete enumer2;
725 enumer2 = NULL;
726
73c04bcf
A
727 // Now un-compact it
728 mutable2 = compactDict->cloneMutable(status);
729 if (U_FAILURE(status)) {
730 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
731 goto cleanup;
732 }
46f4442e 733
73c04bcf
A
734 cloneEnum = mutable2->openWords(status);
735 if (U_FAILURE(status)) {
736 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
737 goto cleanup;
738 }
46f4442e 739
73c04bcf
A
740 if (wordCount != (testCount = cloneEnum->count(status))) {
741 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
742 testCount, wordCount, u_errorName(status));
743 goto cleanup;
744 }
46f4442e 745
73c04bcf
A
746 // Compact original dictionary to clone. Note that we can only compare the same kind of
747 // dictionary as the order of the enumerators is not guaranteed to be the same between
748 // different kinds
46f4442e 749 enumer1 = mutableDict->openWords(status);
73c04bcf
A
750 if (U_FAILURE(status)) {
751 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
752 goto cleanup;
753 }
46f4442e
A
754
755 originalWord = enumer1->snext(status);
73c04bcf
A
756 cloneWord = cloneEnum->snext(status);
757 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
758 if (*originalWord != *cloneWord) {
759 errln("Original and cloned MutableTrieDictionary word mismatch\n");
760 goto cleanup;
761 }
46f4442e 762 originalWord = enumer1->snext(status);
73c04bcf
A
763 cloneWord = cloneEnum->snext(status);
764 }
46f4442e 765
73c04bcf
A
766 if (U_FAILURE(status)) {
767 errln("Enumeration failed: %s\n", u_errorName(status));
768 goto cleanup;
769 }
46f4442e 770
73c04bcf
A
771 if (originalWord != cloneWord) {
772 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
773 goto cleanup;
774 }
775
776 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
777 compact2 = new CompactTrieDictionary(compactDict->data(), status);
778 if (U_FAILURE(status)) {
779 errln("CompactTrieDictionary(const void *,...) failed\n");
780 goto cleanup;
781 }
46f4442e 782
73c04bcf
A
783 if (compact2->dataSize() == 0) {
784 errln("CompactTrieDictionary->dataSize() == 0\n");
785 goto cleanup;
786 }
46f4442e 787
73c04bcf 788 // Now count the words via the second dictionary
46f4442e
A
789 delete enumer1;
790 enumer1 = compact2->openWords(status);
73c04bcf
A
791 if (U_FAILURE(status)) {
792 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
793 goto cleanup;
794 }
46f4442e
A
795
796 if (wordCount != (testCount = enumer1->count(status))) {
73c04bcf
A
797 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
798 testCount, wordCount, u_errorName(status));
799 goto cleanup;
800 }
46f4442e 801
73c04bcf
A
802cleanup:
803 delete compactDict;
804 delete mutableDict;
805 delete breaks;
806 delete[] testFile;
46f4442e 807 delete enumer1;
73c04bcf
A
808 delete mutable2;
809 delete cloneEnum;
810 delete compact2;
811}
812
73c04bcf
A
813
814//----------------------------------------------------------------------------
815//
816// generalIteratorTest Given a break iterator and a set of test data,
817// Run the tests and report the results.
818//
819//----------------------------------------------------------------------------
820void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
821{
822
823 bi.setText(td.fDataToBreak);
824
825 testFirstAndNext(bi, td);
826
827 testLastAndPrevious(bi, td);
828
829 testFollowing(bi, td);
830 testPreceding(bi, td);
831 testIsBoundary(bi, td);
832 doMultipleSelectionTest(bi, td);
833}
834
835
836//
837// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
838// kind of loop.
839//
840void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
841{
842 UErrorCode status = U_ZERO_ERROR;
843 int32_t p;
844 int32_t lastP = -1;
845 int32_t tag;
846
847 logln("Test first and next");
848 bi.setText(td.fDataToBreak);
849 td.clearResults();
850
851 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
852 td.fActualBreakPositions.addElement(p, status); // Save result.
853 tag = bi.getRuleStatus();
854 td.fActualTags.addElement(tag, status);
855 if (p <= lastP) {
856 // If the iterator is not making forward progress, stop.
857 // No need to raise an error here, it'll be detected in the normal check of results.
858 break;
859 }
860 lastP = p;
861 }
862 td.checkResults("testFirstAndNext", this);
863}
864
865
866//
867// TestLastAndPrevious. Run the iterator backwards, starting with last().
868//
869void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
870{
871 UErrorCode status = U_ZERO_ERROR;
872 int32_t p;
873 int32_t lastP = 0x7ffffffe;
874 int32_t tag;
875
46f4442e 876 logln("Test last and previous");
73c04bcf
A
877 bi.setText(td.fDataToBreak);
878 td.clearResults();
879
880 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
881 // Save break position. Insert it at start of vector of results, shoving
882 // already-saved results further towards the end.
883 td.fActualBreakPositions.insertElementAt(p, 0, status);
884 // bi.previous(); // TODO: Why does this fix things up????
885 // bi.next();
886 tag = bi.getRuleStatus();
887 td.fActualTags.insertElementAt(tag, 0, status);
888 if (p >= lastP) {
889 // If the iterator is not making progress, stop.
890 // No need to raise an error here, it'll be detected in the normal check of results.
891 break;
892 }
893 lastP = p;
894 }
895 td.checkResults("testLastAndPrevious", this);
896}
897
898
899void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
900{
901 UErrorCode status = U_ZERO_ERROR;
902 int32_t p;
903 int32_t tag;
904 int32_t lastP = -2; // A value that will never be returned as a break position.
905 // cannot be -1; that is returned for DONE.
906 int i;
907
908 logln("testFollowing():");
909 bi.setText(td.fDataToBreak);
910 td.clearResults();
911
912 // Save the starting point, since we won't get that out of following.
913 p = bi.first();
914 td.fActualBreakPositions.addElement(p, status); // Save result.
915 tag = bi.getRuleStatus();
916 td.fActualTags.addElement(tag, status);
917
918 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
919 p = bi.following(i);
920 if (p != lastP) {
921 if (p == RuleBasedBreakIterator::DONE) {
922 break;
923 }
924 // We've reached a new break position. Save it.
925 td.fActualBreakPositions.addElement(p, status); // Save result.
926 tag = bi.getRuleStatus();
927 td.fActualTags.addElement(tag, status);
928 lastP = p;
929 }
930 }
931 // The loop normally exits by means of the break in the middle.
932 // Make sure that the index was at the correct position for the break iterator to have
933 // returned DONE.
934 if (i != td.fDataToBreak.length()) {
935 errln("testFollowing(): iterator returned DONE prematurely.");
936 }
937
938 // Full check of all results.
939 td.checkResults("testFollowing", this);
940}
941
942
943
944void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
945 UErrorCode status = U_ZERO_ERROR;
946 int32_t p;
947 int32_t tag;
948 int32_t lastP = 0x7ffffffe;
949 int i;
950
951 logln("testPreceding():");
952 bi.setText(td.fDataToBreak);
953 td.clearResults();
954
955 p = bi.last();
956 td.fActualBreakPositions.addElement(p, status);
957 tag = bi.getRuleStatus();
958 td.fActualTags.addElement(tag, status);
959
960 for (i = td.fDataToBreak.length(); i>=-1; i--) {
961 p = bi.preceding(i);
962 if (p != lastP) {
963 if (p == RuleBasedBreakIterator::DONE) {
964 break;
965 }
966 // We've reached a new break position. Save it.
967 td.fActualBreakPositions.insertElementAt(p, 0, status);
968 lastP = p;
969 tag = bi.getRuleStatus();
970 td.fActualTags.insertElementAt(tag, 0, status);
971 }
972 }
973 // The loop normally exits by means of the break in the middle.
974 // Make sure that the index was at the correct position for the break iterator to have
975 // returned DONE.
976 if (i != 0) {
977 errln("testPreceding(): iterator returned DONE prematurely.");
978 }
979
980 // Full check of all results.
981 td.checkResults("testPreceding", this);
982}
983
984
985
986void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
987 UErrorCode status = U_ZERO_ERROR;
988 int i;
989 int32_t tag;
990
991 logln("testIsBoundary():");
992 bi.setText(td.fDataToBreak);
993 td.clearResults();
994
995 for (i = 0; i <= td.fDataToBreak.length(); i++) {
996 if (bi.isBoundary(i)) {
997 td.fActualBreakPositions.addElement(i, status); // Save result.
998 tag = bi.getRuleStatus();
999 td.fActualTags.addElement(tag, status);
1000 }
1001 }
1002 td.checkResults("testIsBoundary: ", this);
1003}
1004
1005
1006
1007void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1008{
1009 iterator.setText(td.fDataToBreak);
1010
1011 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1012 int32_t offset = iterator.first();
1013 int32_t testOffset;
1014 int32_t count = 0;
1015
1016 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1017
1018 if (*testIterator != iterator)
1019 errln("clone() or operator!= failed: two clones compared unequal");
1020
1021 do {
1022 testOffset = testIterator->first();
1023 testOffset = testIterator->next(count);
1024 if (offset != testOffset)
1025 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1026
1027 if (offset != RuleBasedBreakIterator::DONE) {
1028 count++;
1029 offset = iterator.next();
1030
1031 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1032 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1033 if (count > 10000 || offset == -1) {
1034 errln("operator== failed too many times. Stopping test.");
1035 if (offset == -1) {
1036 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1037 }
1038 return;
1039 }
1040 }
1041 }
1042 } while (offset != RuleBasedBreakIterator::DONE);
1043
1044 // now do it backwards...
1045 offset = iterator.last();
1046 count = 0;
1047
1048 do {
1049 testOffset = testIterator->last();
1050 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
1051 if (offset != testOffset)
1052 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1053
1054 if (offset != RuleBasedBreakIterator::DONE) {
1055 count--;
1056 offset = iterator.previous();
1057 }
1058 } while (offset != RuleBasedBreakIterator::DONE);
1059
1060 delete testIterator;
1061}
1062
1063
1064//---------------------------------------------
1065//
1066// other tests
1067//
1068//---------------------------------------------
1069void RBBITest::TestEmptyString()
1070{
1071 UnicodeString text = "";
1072 UErrorCode status = U_ZERO_ERROR;
1073
1074 BITestData x(status);
1075 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
1076 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1077 if (U_FAILURE(status))
1078 {
1079 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1080 return;
1081 }
1082 generalIteratorTest(*bi, x);
1083 delete bi;
1084}
1085
1086void RBBITest::TestGetAvailableLocales()
1087{
1088 int32_t locCount = 0;
1089 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1090
1091 if (locCount == 0)
1092 errln("getAvailableLocales() returned an empty list!");
1093 // Just make sure that it's returning good memory.
1094 int32_t i;
1095 for (i = 0; i < locCount; ++i) {
1096 logln(locList[i].getName());
1097 }
1098}
1099
1100//Testing the BreakIterator::getDisplayName() function
1101void RBBITest::TestGetDisplayName()
1102{
1103 UnicodeString result;
1104
1105 BreakIterator::getDisplayName(Locale::getUS(), result);
1106 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1107 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1108 + result);
1109
1110 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1111 if (result != "French (France)")
1112 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1113 + result);
1114}
1115/**
1116 * Test End Behaviour
1117 * @bug 4068137
1118 */
1119void RBBITest::TestEndBehaviour()
1120{
1121 UErrorCode status = U_ZERO_ERROR;
1122 UnicodeString testString("boo.");
1123 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1124 if (U_FAILURE(status))
1125 {
1126 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1127 return;
1128 }
1129 wb->setText(testString);
1130
1131 if (wb->first() != 0)
1132 errln("Didn't get break at beginning of string.");
1133 if (wb->next() != 3)
1134 errln("Didn't get break before period in \"boo.\"");
1135 if (wb->current() != 4 && wb->next() != 4)
1136 errln("Didn't get break at end of string.");
1137 delete wb;
1138}
1139/*
1140 * @bug 4153072
1141 */
1142void RBBITest::TestBug4153072() {
1143 UErrorCode status = U_ZERO_ERROR;
1144 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1145 if (U_FAILURE(status))
1146 {
1147 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1148 return;
1149 }
1150 UnicodeString str("...Hello, World!...");
1151 int32_t begin = 3;
1152 int32_t end = str.length() - 3;
1153 UBool onBoundary;
1154
1155 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1156 iter->adoptText(textIterator);
1157 int index;
1158 // Note: with the switch to UText, there is no way to restrict the
1159 // iteration range to begin at an index other than zero.
1160 // String character iterators created with a non-zero bound are
1161 // treated by RBBI as being empty.
1162 for (index = -1; index < begin + 1; ++index) {
1163 onBoundary = iter->isBoundary(index);
1164 if (index == 0? !onBoundary : onBoundary) {
1165 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1166 " and begin index = " + begin);
1167 }
1168 }
1169 delete iter;
1170}
1171
1172
46f4442e
A
1173//
1174// Test for problem reported by Ashok Matoria on 9 July 2007
1175// One.<kSoftHyphen><kSpace>Two.
1176//
1177// Sentence break at start (0) and then on calling next() it breaks at
1178// 'T' of "Two". Now, at this point if I do next() and
//       then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
1180//
1181void RBBITest::TestBug5775() {
1182 UErrorCode status = U_ZERO_ERROR;
1183 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1184 TEST_ASSERT_SUCCESS(status);
1185 TEST_ASSERT(bi != NULL);
1186
1187 if (U_FAILURE(status) || bi == NULL) {
1188 // TEST_ASSERT already printed error message.
1189 return;
1190 }
1191
1192 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1193 // 01234 56789
1194 s = s.unescape();
1195 bi->setText(s);
1196 int pos = bi->next();
1197 TEST_ASSERT(pos == 6);
1198 pos = bi->next();
1199 TEST_ASSERT(pos == 10);
1200 pos = bi->previous();
1201 TEST_ASSERT(pos == 6);
1202 delete bi;
1203}
1204
1205
1206
73c04bcf
A
1207/**
1208 * Test Japanese Line Break
1209 * @bug 4095322
1210 */
1211void RBBITest::TestJapaneseLineBreak()
1212{
1213#if 0
1214 // Test needs updating some more... Dump it for now.
1215
1216
1217 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1218 // as opening and closing punctuation for line breaking.
1219 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1220 // from these tests. 6-13-2002
1221 //
1222 UErrorCode status = U_ZERO_ERROR;
1223 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1224 UnicodeString precedingChars = CharsToUnicodeString(
1225 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1226 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1227 UnicodeString followingChars = CharsToUnicodeString(
1228 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1229 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1230 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1231 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1232 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1233 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1234
1235 int32_t i;
1236 if (U_FAILURE(status))
1237 {
1238 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1239 return;
1240 }
1241
1242 for (i = 0; i < precedingChars.length(); i++) {
1243 testString.setCharAt(1, precedingChars[i]);
1244 iter->setText(testString);
1245 int32_t j = iter->first();
1246 if (j != 0)
1247 errln("ja line break failure: failed to start at 0");
1248 j = iter->next();
1249 if (j != 1)
1250 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1251 + "' (" + ((int)(precedingChars[i])) + ")");
1252 j = iter->next();
1253 if (j != 3)
1254 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1255 + "' (" + ((int)(precedingChars[i])) + ")");
1256 }
1257
1258 for (i = 0; i < followingChars.length(); i++) {
1259 testString.setCharAt(1, followingChars[i]);
1260 iter->setText(testString);
1261 int j = iter->first();
1262 if (j != 0)
1263 errln("ja line break failure: failed to start at 0");
1264 j = iter->next();
1265 if (j != 2)
1266 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1267 + "' (" + ((int)(followingChars[i])) + ")");
1268 j = iter->next();
1269 if (j != 3)
1270 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1271 + "' (" + ((int)(followingChars[i])) + ")");
1272 }
1273 delete iter;
1274#endif
1275}
1276
1277
1278//------------------------------------------------------------------------------
1279//
1280// RBBITest::Extended Run RBBI Tests from an external test data file
1281//
1282//------------------------------------------------------------------------------
1283
1284struct TestParams {
1285 BreakIterator *bi;
1286 UnicodeString dataToBreak;
1287 UVector32 *expectedBreaks;
1288 UVector32 *srcLine;
1289 UVector32 *srcCol;
1290};
1291
1292void RBBITest::executeTest(TestParams *t) {
1293 int32_t bp;
1294 int32_t prevBP;
1295 int32_t i;
1296
1297 if (t->bi == NULL) {
1298 return;
1299 }
1300
1301 t->bi->setText(t->dataToBreak);
1302 //
1303 // Run the iterator forward
1304 //
1305 prevBP = -1;
1306 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1307 if (prevBP == bp) {
1308 // Fail for lack of forward progress.
1309 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1310 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1311 break;
1312 }
1313
1314 // Check that there were we didn't miss an expected break between the last one
1315 // and this one.
1316 for (i=prevBP+1; i<bp; i++) {
1317 if (t->expectedBreaks->elementAti(i) != 0) {
1318 int expected[] = {0, i};
1319 printStringBreaks(t->dataToBreak, expected, 2);
1320 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1321 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1322 }
1323 }
1324
1325 // Check that the break we did find was expected
1326 if (t->expectedBreaks->elementAti(bp) == 0) {
1327 int expected[] = {0, bp};
1328 printStringBreaks(t->dataToBreak, expected, 2);
1329 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1330 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1331 } else {
1332 // The break was expected.
1333 // Check that the {nnn} tag value is correct.
1334 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1335 if (expectedTagVal == -1) {
1336 expectedTagVal = 0;
1337 }
1338 int32_t line = t->srcLine->elementAti(bp);
1339 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1340 if (rs != expectedTagVal) {
1341 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1342 " Actual, Expected status = %4d, %4d",
1343 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1344 }
1345 }
1346
1347
1348 prevBP = bp;
1349 }
1350
1351 // Verify that there were no missed expected breaks after the last one found
1352 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1353 if (t->expectedBreaks->elementAti(i) != 0) {
1354 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1355 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1356 }
1357 }
1358
1359 //
1360 // Run the iterator backwards, verify that the same breaks are found.
1361 //
1362 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
1363 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1364 if (prevBP == bp) {
1365 // Fail for lack of progress.
1366 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1367 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1368 break;
1369 }
1370
1371 // Check that there were we didn't miss an expected break between the last one
1372 // and this one. (UVector returns zeros for index out of bounds.)
1373 for (i=prevBP-1; i>bp; i--) {
1374 if (t->expectedBreaks->elementAti(i) != 0) {
1375 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1376 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1377 }
1378 }
1379
1380 // Check that the break we did find was expected
1381 if (t->expectedBreaks->elementAti(bp) == 0) {
1382 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1383 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1384 } else {
1385 // The break was expected.
1386 // Check that the {nnn} tag value is correct.
1387 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1388 if (expectedTagVal == -1) {
1389 expectedTagVal = 0;
1390 }
1391 int line = t->srcLine->elementAti(bp);
1392 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1393 if (rs != expectedTagVal) {
1394 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1395 " Actual, Expected status = %4d, %4d",
1396 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1397 }
1398 }
1399
1400 prevBP = bp;
1401 }
1402
1403 // Verify that there were no missed breaks prior to the last one found
1404 for (i=prevBP-1; i>=0; i--) {
1405 if (t->expectedBreaks->elementAti(i) != 0) {
1406 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1407 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1408 }
1409 }
1410}
1411
1412
1413void RBBITest::TestExtended() {
1414#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1415 UErrorCode status = U_ZERO_ERROR;
1416 Locale locale("");
1417
1418 UnicodeString rules;
1419 TestParams tp;
1420 tp.bi = NULL;
1421 tp.expectedBreaks = new UVector32(status);
1422 tp.srcLine = new UVector32(status);
1423 tp.srcCol = new UVector32(status);
1424
46f4442e 1425 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
73c04bcf
A
1426 TEST_ASSERT_SUCCESS(status);
1427
1428
1429 //
1430 // Open and read the test data file.
1431 //
1432 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1433 char testFileName[1000];
1434 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1435 errln("Can't open test data. Path too long.");
1436 return;
1437 }
1438 strcpy(testFileName, testDataDirectory);
1439 strcat(testFileName, "rbbitst.txt");
1440
1441 int len;
46f4442e 1442 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
73c04bcf
A
1443 if (U_FAILURE(status)) {
1444 return; /* something went wrong, error already output */
1445 }
1446
1447
1448
46f4442e 1449
73c04bcf
A
1450 //
1451 // Put the test data into a UnicodeString
1452 //
1453 UnicodeString testString(FALSE, testFile, len);
1454
1455 enum EParseState{
1456 PARSE_COMMENT,
1457 PARSE_TAG,
1458 PARSE_DATA,
1459 PARSE_NUM
1460 }
1461 parseState = PARSE_TAG;
1462
1463 EParseState savedState = PARSE_TAG;
1464
1465 static const UChar CH_LF = 0x0a;
1466 static const UChar CH_CR = 0x0d;
1467 static const UChar CH_HASH = 0x23;
1468 /*static const UChar CH_PERIOD = 0x2e;*/
1469 static const UChar CH_LT = 0x3c;
1470 static const UChar CH_GT = 0x3e;
1471 static const UChar CH_BACKSLASH = 0x5c;
1472 static const UChar CH_BULLET = 0x2022;
1473
1474 int32_t lineNum = 1;
1475 int32_t colStart = 0;
1476 int32_t column = 0;
1477 int32_t charIdx = 0;
1478
1479 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1480
1481 for (charIdx = 0; charIdx < len; ) {
1482 status = U_ZERO_ERROR;
1483 UChar c = testString.charAt(charIdx);
1484 charIdx++;
1485 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1486 // treat CRLF as a unit
1487 c = CH_LF;
1488 charIdx++;
1489 }
1490 if (c == CH_LF || c == CH_CR) {
1491 lineNum++;
1492 colStart = charIdx;
1493 }
1494 column = charIdx - colStart + 1;
1495
1496 switch (parseState) {
1497 case PARSE_COMMENT:
1498 if (c == 0x0a || c == 0x0d) {
1499 parseState = savedState;
1500 }
1501 break;
1502
1503 case PARSE_TAG:
1504 {
1505 if (c == CH_HASH) {
1506 parseState = PARSE_COMMENT;
1507 savedState = PARSE_TAG;
1508 break;
1509 }
1510 if (u_isUWhiteSpace(c)) {
1511 break;
1512 }
1513 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1514 delete tp.bi;
1515 tp.bi = BreakIterator::createWordInstance(locale, status);
1516 charIdx += 5;
1517 break;
1518 }
1519 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1520 delete tp.bi;
1521 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1522 charIdx += 5;
1523 break;
1524 }
1525 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1526 delete tp.bi;
1527 tp.bi = BreakIterator::createLineInstance(locale, status);
1528 charIdx += 5;
1529 break;
1530 }
1531 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1532 delete tp.bi;
1533 tp.bi = NULL;
46f4442e 1534 tp.bi = BreakIterator::createSentenceInstance(locale, status);
73c04bcf
A
1535 charIdx += 5;
1536 break;
1537 }
1538 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1539 delete tp.bi;
1540 tp.bi = BreakIterator::createTitleInstance(locale, status);
1541 charIdx += 6;
1542 break;
1543 }
46f4442e 1544
73c04bcf
A
1545 // <locale loc_name>
1546 localeMatcher.reset(testString);
1547 if (localeMatcher.lookingAt(charIdx-1, status)) {
1548 UnicodeString localeName = localeMatcher.group(1, status);
1549 char localeName8[100];
1550 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1551 locale = Locale::createFromName(localeName8);
1552 charIdx += localeMatcher.group(0, status).length();
1553 TEST_ASSERT_SUCCESS(status);
1554 break;
1555 }
1556 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1557 parseState = PARSE_DATA;
1558 charIdx += 5;
1559 tp.dataToBreak = "";
1560 tp.expectedBreaks->removeAllElements();
1561 tp.srcCol ->removeAllElements();
1562 tp.srcLine->removeAllElements();
1563 break;
1564 }
1565
1566 errln("line %d: Tag expected in test file.", lineNum);
73c04bcf
A
1567 parseState = PARSE_COMMENT;
1568 savedState = PARSE_DATA;
46f4442e 1569 goto end_test; // Stop the test.
73c04bcf
A
1570 }
1571 break;
1572
1573 case PARSE_DATA:
1574 if (c == CH_BULLET) {
1575 int32_t breakIdx = tp.dataToBreak.length();
1576 tp.expectedBreaks->setSize(breakIdx+1);
1577 tp.expectedBreaks->setElementAt(-1, breakIdx);
1578 tp.srcLine->setSize(breakIdx+1);
1579 tp.srcLine->setElementAt(lineNum, breakIdx);
1580 tp.srcCol ->setSize(breakIdx+1);
1581 tp.srcCol ->setElementAt(column, breakIdx);
1582 break;
1583 }
1584
1585 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1586 // Add final entry to mappings from break location to source file position.
1587 // Need one extra because last break position returned is after the
1588 // last char in the data, not at the last char.
1589 tp.srcLine->addElement(lineNum, status);
1590 tp.srcCol ->addElement(column, status);
1591
1592 parseState = PARSE_TAG;
1593 charIdx += 6;
1594
1595 // RUN THE TEST!
1596 executeTest(&tp);
1597 break;
1598 }
1599
46f4442e 1600 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
73c04bcf
A
1601 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1602 // Get the code point from the name and insert it into the test data.
1603 // (Damn, no API takes names in Unicode !!!
1604 // we've got to take it back to char *)
1605 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1606 int32_t nameLength = nameEndIdx - (charIdx+2);
1607 char charNameBuf[200];
1608 UChar32 theChar = -1;
1609 if (nameEndIdx != -1) {
1610 UErrorCode status = U_ZERO_ERROR;
1611 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1612 charNameBuf[sizeof(charNameBuf)-1] = 0;
1613 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1614 if (U_FAILURE(status)) {
1615 theChar = -1;
1616 }
1617 }
1618 if (theChar == -1) {
1619 errln("Error in named character in test file at line %d, col %d",
1620 lineNum, column);
1621 } else {
1622 // Named code point was recognized. Insert it
1623 // into the test data.
1624 tp.dataToBreak.append(theChar);
1625 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1626 tp.srcLine->addElement(lineNum, status);
1627 tp.srcCol ->addElement(column, status);
1628 }
1629 }
1630 if (nameEndIdx > charIdx) {
1631 charIdx = nameEndIdx+1;
1632
1633 }
1634 break;
1635 }
1636
1637
1638
1639
1640 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1641 charIdx++;
1642 int32_t breakIdx = tp.dataToBreak.length();
1643 tp.expectedBreaks->setSize(breakIdx+1);
1644 tp.expectedBreaks->setElementAt(-1, breakIdx);
1645 tp.srcLine->setSize(breakIdx+1);
1646 tp.srcLine->setElementAt(lineNum, breakIdx);
1647 tp.srcCol ->setSize(breakIdx+1);
1648 tp.srcCol ->setElementAt(column, breakIdx);
1649 break;
1650 }
1651
1652 if (c == CH_LT) {
1653 tagValue = 0;
1654 parseState = PARSE_NUM;
1655 break;
1656 }
1657
1658 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1659 parseState = PARSE_COMMENT;
1660 savedState = PARSE_DATA;
1661 break;
1662 }
1663
1664 if (c == CH_BACKSLASH) {
1665 // Check for \ at end of line, a line continuation.
1666 // Advance over (discard) the newline
1667 UChar32 cp = testString.char32At(charIdx);
1668 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1669 // We have a CR LF
1670 // Need an extra increment of the input ptr to move over both of them
1671 charIdx++;
1672 }
1673 if (cp == CH_LF || cp == CH_CR) {
1674 lineNum++;
1675 colStart = charIdx;
1676 charIdx++;
1677 break;
1678 }
1679
1680 // Let unescape handle the back slash.
1681 cp = testString.unescapeAt(charIdx);
1682 if (cp != -1) {
1683 // Escape sequence was recognized. Insert the char
1684 // into the test data.
1685 tp.dataToBreak.append(cp);
1686 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1687 tp.srcLine->addElement(lineNum, status);
1688 tp.srcCol ->addElement(column, status);
1689 }
1690 break;
1691 }
1692
1693
1694 // Not a recognized backslash escape sequence.
1695 // Take the next char as a literal.
1696 // TODO: Should this be an error?
1697 c = testString.charAt(charIdx);
1698 charIdx = testString.moveIndex32(charIdx, 1);
1699 }
1700
1701 // Normal, non-escaped data char.
1702 tp.dataToBreak.append(c);
1703
1704 // Save the mapping from offset in the data to line/column numbers in
1705 // the original input file. Will be used for better error messages only.
1706 // If there's an expected break before this char, the slot in the mapping
1707 // vector will already be set for this char; don't overwrite it.
1708 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1709 tp.srcLine->addElement(lineNum, status);
1710 tp.srcCol ->addElement(column, status);
1711 }
1712 break;
1713
1714
1715 case PARSE_NUM:
1716 // We are parsing an expected numeric tag value, like <1234>,
1717 // within a chunk of data.
1718 if (u_isUWhiteSpace(c)) {
1719 break;
1720 }
1721
1722 if (c == CH_GT) {
1723 // Finished the number. Add the info to the expected break data,
1724 // and switch parse state back to doing plain data.
1725 parseState = PARSE_DATA;
1726 if (tagValue == 0) {
1727 tagValue = -1;
1728 }
1729 int32_t breakIdx = tp.dataToBreak.length();
1730 tp.expectedBreaks->setSize(breakIdx+1);
1731 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1732 tp.srcLine->setSize(breakIdx+1);
1733 tp.srcLine->setElementAt(lineNum, breakIdx);
1734 tp.srcCol ->setSize(breakIdx+1);
1735 tp.srcCol ->setElementAt(column, breakIdx);
1736 break;
1737 }
1738
1739 if (u_isdigit(c)) {
1740 tagValue = tagValue*10 + u_charDigitValue(c);
1741 break;
1742 }
1743
1744 errln("Syntax Error in test file at line %d, col %d",
1745 lineNum, column);
73c04bcf 1746 parseState = PARSE_COMMENT;
46f4442e 1747 goto end_test; // Stop the test
73c04bcf
A
1748 break;
1749 }
1750
1751
1752 if (U_FAILURE(status)) {
1753 errln("ICU Error %s while parsing test file at line %d.",
1754 u_errorName(status), lineNum);
73c04bcf 1755 status = U_ZERO_ERROR;
46f4442e 1756 goto end_test; // Stop the test
73c04bcf
A
1757 }
1758
1759 }
1760
1761end_test:
1762 delete tp.bi;
1763 delete tp.expectedBreaks;
1764 delete tp.srcLine;
1765 delete tp.srcCol;
1766 delete [] testFile;
1767#endif
1768}
1769
46f4442e
A
1770void RBBITest::TestThaiBreaks() {
1771 UErrorCode status=U_ZERO_ERROR;
1772 BreakIterator* b;
1773 Locale locale = Locale("th");
1774 int32_t p, index;
1775 UChar c[]= {
1776 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1777 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1778 0x0E16, 0x0E49, 0x0E33
1779 };
1780 int32_t expectedWordResult[] = {
1781 2, 3, 6, 10, 11, 15, 17, 20, 22
1782 };
1783 int32_t expectedLineResult[] = {
1784 3, 6, 11, 15, 17, 20, 22
1785 };
1786 int32_t size = sizeof(c)/sizeof(UChar);
1787 UnicodeString text=UnicodeString(c);
1788
1789 b = BreakIterator::createWordInstance(locale, status);
1790 if (U_FAILURE(status)) {
1791 errln("Unable to create thai word break iterator.\n");
1792 return;
1793 }
1794 b->setText(text);
1795 p = index = 0;
1796 while ((p=b->next())!=BreakIterator::DONE && p < size) {
1797 if (p != expectedWordResult[index++]) {
1798 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
1799 }
1800 }
1801 delete b;
1802
1803 b = BreakIterator::createLineInstance(locale, status);
1804 if (U_FAILURE(status)) {
1805 printf("Unable to create thai line break iterator.\n");
1806 return;
1807 }
1808 b->setText(text);
1809 p = index = 0;
1810 while ((p=b->next())!=BreakIterator::DONE && p < size) {
1811 if (p != expectedLineResult[index++]) {
1812 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
1813 }
1814 }
1815
1816 delete b;
1817}
1818
73c04bcf
A
1819
1820//-------------------------------------------------------------------------------
1821//
1822// ReadAndConvertFile Read a text data file, convert it to UChars, and
//                       return the data in one big UChar * buffer, which the caller must delete.
1824//
46f4442e
A
1825// parameters:
1826// fileName: the name of the file, with no directory part. The test data directory
1827// is assumed.
1828// ulen an out parameter, receives the actual length (in UChars) of the file data.
1829// encoding The file encoding. If the file contains a BOM, that will override the encoding
1830// specified here. The BOM, if it exists, will be stripped from the returned data.
1831// Pass NULL for the system default encoding.
1832// status
1833// returns:
1834// The file data, converted to UChar.
1835// The caller must delete this when done with
1836// delete [] theBuffer;
1837//
73c04bcf
A
1838// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1839// Move this function to some common place.
1840//
1841//--------------------------------------------------------------------------------
46f4442e 1842UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
73c04bcf
A
1843 UChar *retPtr = NULL;
1844 char *fileBuf = NULL;
1845 UConverter* conv = NULL;
1846 FILE *f = NULL;
1847
1848 ulen = 0;
1849 if (U_FAILURE(status)) {
1850 return retPtr;
1851 }
1852
1853 //
1854 // Open the file.
1855 //
1856 f = fopen(fileName, "rb");
1857 if (f == 0) {
46f4442e 1858 dataerrln("[DATA] Error opening test data file %s\n", fileName);
73c04bcf
A
1859 status = U_FILE_ACCESS_ERROR;
1860 return NULL;
1861 }
1862 //
1863 // Read it in
1864 //
1865 int fileSize;
1866 int amt_read;
1867
1868 fseek( f, 0, SEEK_END);
1869 fileSize = ftell(f);
1870 fileBuf = new char[fileSize];
1871 fseek(f, 0, SEEK_SET);
1872 amt_read = fread(fileBuf, 1, fileSize, f);
1873 if (amt_read != fileSize || fileSize <= 0) {
1874 errln("Error reading test data file.");
1875 goto cleanUpAndReturn;
1876 }
1877
1878 //
1879 // Look for a Unicode Signature (BOM) on the data just read
1880 //
1881 int32_t signatureLength;
1882 const char * fileBufC;
46f4442e 1883 const char* bomEncoding;
73c04bcf
A
1884
1885 fileBufC = fileBuf;
46f4442e 1886 bomEncoding = ucnv_detectUnicodeSignature(
73c04bcf 1887 fileBuf, fileSize, &signatureLength, &status);
46f4442e 1888 if(bomEncoding!=NULL ){
73c04bcf
A
1889 fileBufC += signatureLength;
1890 fileSize -= signatureLength;
46f4442e 1891 encoding = bomEncoding;
73c04bcf
A
1892 }
1893
1894 //
1895 // Open a converter to take the rule file to UTF-16
1896 //
1897 conv = ucnv_open(encoding, &status);
1898 if (U_FAILURE(status)) {
1899 goto cleanUpAndReturn;
1900 }
1901
1902 //
1903 // Convert the rules to UChar.
1904 // Preflight first to determine required buffer size.
1905 //
1906 ulen = ucnv_toUChars(conv,
1907 NULL, // dest,
1908 0, // destCapacity,
1909 fileBufC,
1910 fileSize,
1911 &status);
1912 if (status == U_BUFFER_OVERFLOW_ERROR) {
1913 // Buffer Overflow is expected from the preflight operation.
1914 status = U_ZERO_ERROR;
1915
1916 retPtr = new UChar[ulen+1];
1917 ucnv_toUChars(conv,
1918 retPtr, // dest,
1919 ulen+1,
1920 fileBufC,
1921 fileSize,
1922 &status);
1923 }
1924
1925cleanUpAndReturn:
1926 fclose(f);
1927 delete []fileBuf;
1928 ucnv_close(conv);
1929 if (U_FAILURE(status)) {
1930 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1931 delete retPtr;
1932 retPtr = 0;
1933 ulen = 0;
1934 };
1935 return retPtr;
1936}
1937
1938
73c04bcf 1939
46f4442e 1940//--------------------------------------------------------------------------------------------
73c04bcf 1941//
46f4442e 1942// Run tests from each of the boundary test data files distributed by the Unicode Consortium
73c04bcf 1943//
46f4442e
A
1944//-------------------------------------------------------------------------------------------
1945void RBBITest::TestUnicodeFiles() {
1946 RuleBasedBreakIterator *bi;
1947 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1948
46f4442e
A
1949 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
1950 TEST_ASSERT_SUCCESS(status);
1951 if (U_SUCCESS(status)) {
1952 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1953 }
1954 delete bi;
73c04bcf 1955
46f4442e
A
1956 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
1957 TEST_ASSERT_SUCCESS(status);
1958 if (U_SUCCESS(status)) {
1959 runUnicodeTestData("WordBreakTest.txt", bi);
1960 }
1961 delete bi;
73c04bcf 1962
46f4442e
A
1963 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
1964 TEST_ASSERT_SUCCESS(status);
1965 if (U_SUCCESS(status)) {
1966 runUnicodeTestData("SentenceBreakTest.txt", bi);
1967 }
1968 delete bi;
73c04bcf 1969
46f4442e
A
1970 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1971 TEST_ASSERT_SUCCESS(status);
1972 if (U_SUCCESS(status)) {
1973 runUnicodeTestData("LineBreakTest.txt", bi);
73c04bcf 1974 }
46f4442e 1975 delete bi;
73c04bcf
A
1976}
1977
1978
46f4442e
A
1979//--------------------------------------------------------------------------------------------
1980//
1981// Run tests from one of the boundary test data files distributed by the Unicode Consortium
1982//
1983//-------------------------------------------------------------------------------------------
1984void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1985#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1986 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1987
46f4442e
A
1988 //
1989 // Open and read the test data file, put it into a UnicodeString.
1990 //
1991 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1992 char testFileName[1000];
1993 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1994 dataerrln("[DATA] Can't open test data. Path too long.");
73c04bcf
A
1995 return;
1996 }
46f4442e
A
1997 strcpy(testFileName, testDataDirectory);
1998 strcat(testFileName, fileName);
1999
2000 logln("Opening data file %s\n", fileName);
73c04bcf 2001
46f4442e
A
2002 int len;
2003 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2004 if (status != U_FILE_ACCESS_ERROR) {
2005 TEST_ASSERT_SUCCESS(status);
2006 TEST_ASSERT(testFile != NULL);
2007 }
2008 if (U_FAILURE(status) || testFile == NULL) {
2009 return; /* something went wrong, error already output */
2010 }
2011 UnicodeString testFileAsString(TRUE, testFile, len);
73c04bcf 2012
46f4442e
A
2013 //
2014 // Parse the test data file using a regular expression.
2015 // Each kind of token is recognized in its own capture group; what type of item was scanned
2016 // is identified by which group had a match.
2017 //
2018 // Caputure Group # 1 2 3 4 5
2019 // Parses this item: divide x hex digits comment \n unrecognized \n
2020 //
2021 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2022 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2023 UnicodeString testString;
2024 UVector32 breakPositions(status);
2025 int lineNumber = 1;
2026 TEST_ASSERT_SUCCESS(status);
2027 if (U_FAILURE(status)) {
73c04bcf
A
2028 return;
2029 }
2030
46f4442e
A
2031 //
2032 // Scan through each test case, building up the string to be broken in testString,
2033 // and the positions that should be boundaries in the breakPositions vector.
2034 //
2035 while (tokenMatcher.find()) {
2036 if (tokenMatcher.start(1, status) >= 0) {
2037 // Scanned a divide sign, indicating a break position in the test data.
2038 if (testString.length()>0) {
2039 breakPositions.addElement(testString.length(), status);
73c04bcf 2040 }
46f4442e
A
2041 }
2042 else if (tokenMatcher.start(2, status) >= 0) {
2043 // Scanned an 'x', meaning no break at this position in the test data
2044 // Nothing to be done here.
2045 }
2046 else if (tokenMatcher.start(3, status) >= 0) {
2047 // Scanned Hex digits. Convert them to binary, append to the character data string.
2048 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2049 int length = hexNumber.length();
2050 if (length<=8) {
2051 char buf[10];
2052 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2053 UChar32 c = (UChar32)strtol(buf, NULL, 16);
2054 if (c<=0x10ffff) {
2055 testString.append(c);
2056 } else {
2057 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2058 fileName, lineNumber);
2059 }
2060 } else {
2061 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2062 fileName, lineNumber);
2063 }
2064 }
2065 else if (tokenMatcher.start(4, status) >= 0) {
2066 // Scanned to end of a line, possibly skipping over a comment in the process.
2067 // If the line from the file contained test data, run the test now.
2068 //
2069 if (testString.length() > 0) {
2070 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
73c04bcf
A
2071 }
2072
46f4442e
A
2073 // Clear out this test case.
2074 // The string and breakPositions vector will be refilled as the next
2075 // test case is parsed.
2076 testString.remove();
2077 breakPositions.removeAllElements();
2078 lineNumber++;
2079 } else {
2080 // Scanner catchall. Something unrecognized appeared on the line.
2081 char token[16];
2082 UnicodeString uToken = tokenMatcher.group(0, status);
2083 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2084 token[sizeof(token)-1] = 0;
2085 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
2086
2087 // Clean up, in preparation for continuing with the next line.
2088 testString.remove();
2089 breakPositions.removeAllElements();
2090 lineNumber++;
2091 }
2092 TEST_ASSERT_SUCCESS(status);
2093 if (U_FAILURE(status)) {
73c04bcf
A
2094 break;
2095 }
46f4442e 2096 }
73c04bcf 2097
46f4442e
A
2098 delete [] testFile;
2099 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2100}
73c04bcf 2101
46f4442e
A
2102//--------------------------------------------------------------------------------------------
2103//
2104// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
2105// test data files. Do only a simple, forward-only check -
2106// this test is mostly to check that ICU and the Unicode
2107// data agree with each other.
2108//
2109//--------------------------------------------------------------------------------------------
2110void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2111 const UnicodeString &testString, // Text data to be broken
2112 UVector32 *breakPositions, // Positions where breaks should be found.
2113 RuleBasedBreakIterator *bi) {
2114 int32_t pos; // Break Position in the test string
2115 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
2116 int32_t expectedPos; // Expected break position (index into test string)
2117
2118 bi->setText(testString);
2119 pos = bi->first();
2120 pos = bi->next();
2121
2122 while (pos != BreakIterator::DONE) {
2123 if (expectedI >= breakPositions->size()) {
2124 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2125 testFileName, lineNumber, pos);
2126 break;
73c04bcf 2127 }
46f4442e
A
2128 expectedPos = breakPositions->elementAti(expectedI);
2129 if (pos < expectedPos) {
2130 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2131 testFileName, lineNumber, pos);
2132 break;
2133 }
2134 if (pos > expectedPos) {
2135 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2136 testFileName, lineNumber, expectedPos);
73c04bcf
A
2137 break;
2138 }
46f4442e
A
2139 pos = bi->next();
2140 expectedI++;
2141 }
73c04bcf 2142
46f4442e
A
2143 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2144 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2145 testFileName, lineNumber, breakPositions->elementAti(expectedI));
73c04bcf 2146 }
46f4442e 2147}
73c04bcf 2148
73c04bcf 2149
73c04bcf
A
2150
2151#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
2152//---------------------------------------------------------------------------------------
2153//
2154// classs RBBIMonkeyKind
2155//
2156// Monkey Test for Break Iteration
2157// Abstract interface class. Concrete derived classes independently
2158// implement the break rules for different iterator types.
2159//
2160// The Monkey Test itself uses doesn't know which type of break iterator it is
2161// testing, but works purely in terms of the interface defined here.
2162//
2163//---------------------------------------------------------------------------------------
2164class RBBIMonkeyKind {
2165public:
2166 // Return a UVector of UnicodeSets, representing the character classes used
2167 // for this type of iterator.
2168 virtual UVector *charClasses() = 0;
2169
2170 // Set the test text on which subsequent calls to next() will operate
2171 virtual void setText(const UnicodeString &s) = 0;
2172
2173 // Find the next break postion, starting from the prev break position, or from zero.
2174 // Return -1 after reaching end of string.
2175 virtual int32_t next(int32_t i) = 0;
2176
2177 virtual ~RBBIMonkeyKind();
2178 UErrorCode deferredStatus;
2179
2180
2181protected:
2182 RBBIMonkeyKind();
2183
2184private:
2185};
2186
2187RBBIMonkeyKind::RBBIMonkeyKind() {
2188 deferredStatus = U_ZERO_ERROR;
2189}
2190
2191RBBIMonkeyKind::~RBBIMonkeyKind() {
2192}
2193
2194
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator by one step and return the next value,
//   which is always in the range 0 ... 32767.
static uint32_t m_rand()
{
    const uint32_t advanced = m_seed * 1103515245 + 12345;
    m_seed = advanced;
    // Bits 16 ... 30 of the new seed are the result.
    return (advanced >> 16) & 0x7fff;
}
2210
2211
2212//------------------------------------------------------------------------------------------
2213//
2214// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2215// of RBBIMonkeyKind.
2216//
2217//------------------------------------------------------------------------------------------
2218class RBBICharMonkey: public RBBIMonkeyKind {
2219public:
2220 RBBICharMonkey();
2221 virtual ~RBBICharMonkey();
2222 virtual UVector *charClasses();
2223 virtual void setText(const UnicodeString &s);
2224 virtual int32_t next(int32_t i);
2225private:
2226 UVector *fSets;
2227
2228 UnicodeSet *fCRLFSet;
2229 UnicodeSet *fControlSet;
2230 UnicodeSet *fExtendSet;
46f4442e
A
2231 UnicodeSet *fPrependSet;
2232 UnicodeSet *fSpacingSet;
2233 UnicodeSet *fLSet;
2234 UnicodeSet *fVSet;
2235 UnicodeSet *fTSet;
2236 UnicodeSet *fLVSet;
2237 UnicodeSet *fLVTSet;
73c04bcf
A
2238 UnicodeSet *fHangulSet;
2239 UnicodeSet *fAnySet;
2240
73c04bcf
A
2241 const UnicodeString *fText;
2242};
2243
2244
2245RBBICharMonkey::RBBICharMonkey() {
2246 UErrorCode status = U_ZERO_ERROR;
2247
2248 fText = NULL;
73c04bcf 2249
46f4442e
A
2250 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2251 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2252 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2253 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2254 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2255 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2256 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2257 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2258 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2259 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2260 fHangulSet = new UnicodeSet();
2261 fHangulSet->addAll(*fLSet);
2262 fHangulSet->addAll(*fVSet);
2263 fHangulSet->addAll(*fTSet);
2264 fHangulSet->addAll(*fLVSet);
2265 fHangulSet->addAll(*fLVTSet);
2266 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2267
73c04bcf
A
2268 fSets = new UVector(status);
2269 fSets->addElement(fCRLFSet, status);
2270 fSets->addElement(fControlSet, status);
2271 fSets->addElement(fExtendSet, status);
46f4442e
A
2272 fSets->addElement(fPrependSet, status);
2273 fSets->addElement(fSpacingSet, status);
73c04bcf
A
2274 fSets->addElement(fHangulSet, status);
2275 fSets->addElement(fAnySet, status);
2276 if (U_FAILURE(status)) {
2277 deferredStatus = status;
2278 }
2279}
2280
2281
2282void RBBICharMonkey::setText(const UnicodeString &s) {
2283 fText = &s;
73c04bcf
A
2284}
2285
2286
73c04bcf 2287
46f4442e
A
2288int32_t RBBICharMonkey::next(int32_t prevPos) {
2289 int p0, p1, p2, p3; // Indices of the significant code points around the
2290 // break position being tested. The candidate break
2291 // location is before p2.
2292
2293 int breakPos = -1;
2294
2295 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2296
2297 if (U_FAILURE(deferredStatus)) {
2298 return -1;
73c04bcf 2299 }
46f4442e
A
2300
2301 // Previous break at end of string. return DONE.
2302 if (prevPos >= fText->length()) {
2303 return -1;
73c04bcf 2304 }
46f4442e
A
2305 p0 = p1 = p2 = p3 = prevPos;
2306 c3 = fText->char32At(prevPos);
2307 c0 = c1 = c2 = 0;
2308
2309 // Loop runs once per "significant" character position in the input text.
2310 for (;;) {
2311 // Move all of the positions forward in the input string.
2312 p0 = p1; c0 = c1;
2313 p1 = p2; c1 = c2;
2314 p2 = p3; c2 = c3;
2315
2316 // Advancd p3 by one codepoint
2317 p3 = fText->moveIndex32(p3, 1);
2318 c3 = fText->char32At(p3);
2319
2320 if (p1 == p2) {
2321 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2322 continue;
2323 }
2324 if (p2 == fText->length()) {
2325 // Reached end of string. Always a break position.
2326 break;
2327 }
2328
2329 // Rule GB3 CR x LF
2330 // No Extend or Format characters may appear between the CR and LF,
2331 // which requires the additional check for p2 immediately following p1.
2332 //
2333 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2334 continue;
2335 }
2336
2337 // Rule (GB4). ( Control | CR | LF ) <break>
2338 if (fControlSet->contains(c1) ||
2339 c1 == 0x0D ||
2340 c1 == 0x0A) {
2341 break;
2342 }
2343
2344 // Rule (GB5) <break> ( Control | CR | LF )
2345 //
2346 if (fControlSet->contains(c2) ||
2347 c2 == 0x0D ||
2348 c2 == 0x0A) {
2349 break;
2350 }
2351
2352
2353 // Rule (GB6) L x ( L | V | LV | LVT )
2354 if (fLSet->contains(c1) &&
2355 (fLSet->contains(c2) ||
2356 fVSet->contains(c2) ||
2357 fLVSet->contains(c2) ||
2358 fLVTSet->contains(c2))) {
2359 continue;
2360 }
2361
2362 // Rule (GB7) ( LV | V ) x ( V | T )
2363 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2364 (fVSet->contains(c2) || fTSet->contains(c2))) {
2365 continue;
2366 }
2367
2368 // Rule (GB8) ( LVT | T) x T
2369 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2370 fTSet->contains(c2)) {
2371 continue;
2372 }
2373
2374 // Rule (GB9) Numeric x ALetter
2375 if (fExtendSet->contains(c2)) {
2376 continue;
2377 }
2378
2379 // Rule (GB9a) x SpacingMark
2380 if (fSpacingSet->contains(c2)) {
2381 continue;
2382 }
2383
2384 // Rule (GB9b) Prepend x
2385 if (fPrependSet->contains(c1)) {
2386 continue;
2387 }
2388
2389 // Rule (GB10) Any <break> Any
2390 break;
2391 }
2392
2393 breakPos = p2;
2394 return breakPos;
73c04bcf
A
2395}
2396
2397
46f4442e 2398
73c04bcf
A
2399UVector *RBBICharMonkey::charClasses() {
2400 return fSets;
2401}
2402
2403
2404RBBICharMonkey::~RBBICharMonkey() {
2405 delete fSets;
2406 delete fCRLFSet;
2407 delete fControlSet;
2408 delete fExtendSet;
46f4442e
A
2409 delete fPrependSet;
2410 delete fSpacingSet;
2411 delete fLSet;
2412 delete fVSet;
2413 delete fTSet;
2414 delete fLVSet;
2415 delete fLVTSet;
73c04bcf
A
2416 delete fHangulSet;
2417 delete fAnySet;
73c04bcf
A
2418}
2419
2420//------------------------------------------------------------------------------------------
2421//
2422// class RBBIWordMonkey Word Break specific implementation
2423// of RBBIMonkeyKind.
2424//
2425//------------------------------------------------------------------------------------------
2426class RBBIWordMonkey: public RBBIMonkeyKind {
2427public:
2428 RBBIWordMonkey();
2429 virtual ~RBBIWordMonkey();
2430 virtual UVector *charClasses();
2431 virtual void setText(const UnicodeString &s);
2432 virtual int32_t next(int32_t i);
2433private:
2434 UVector *fSets;
2435
46f4442e
A
2436 UnicodeSet *fCRSet;
2437 UnicodeSet *fLFSet;
2438 UnicodeSet *fNewlineSet;
73c04bcf
A
2439 UnicodeSet *fKatakanaSet;
2440 UnicodeSet *fALetterSet;
46f4442e 2441 UnicodeSet *fMidNumLetSet;
73c04bcf
A
2442 UnicodeSet *fMidLetterSet;
2443 UnicodeSet *fMidNumSet;
2444 UnicodeSet *fNumericSet;
2445 UnicodeSet *fFormatSet;
2446 UnicodeSet *fOtherSet;
2447 UnicodeSet *fExtendSet;
2448 UnicodeSet *fExtendNumLetSet;
2449
2450 RegexMatcher *fMatcher;
2451
2452 const UnicodeString *fText;
2453};
2454
2455
46f4442e 2456RBBIWordMonkey::RBBIWordMonkey()
73c04bcf
A
2457{
2458 UErrorCode status = U_ZERO_ERROR;
2459
73c04bcf
A
2460 fSets = new UVector(status);
2461
46f4442e
A
2462 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2463 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2464 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2465 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2466 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2467 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2468 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2469 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2470 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2471 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2472 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2473 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2474
73c04bcf
A
2475 fOtherSet = new UnicodeSet();
2476 if(U_FAILURE(status)) {
2477 deferredStatus = status;
2478 return;
2479 }
2480
2481 fOtherSet->complement();
46f4442e
A
2482 fOtherSet->removeAll(*fCRSet);
2483 fOtherSet->removeAll(*fLFSet);
2484 fOtherSet->removeAll(*fNewlineSet);
73c04bcf
A
2485 fOtherSet->removeAll(*fKatakanaSet);
2486 fOtherSet->removeAll(*fALetterSet);
2487 fOtherSet->removeAll(*fMidLetterSet);
2488 fOtherSet->removeAll(*fMidNumSet);
2489 fOtherSet->removeAll(*fNumericSet);
2490 fOtherSet->removeAll(*fExtendNumLetSet);
2491 fOtherSet->removeAll(*fFormatSet);
2492 fOtherSet->removeAll(*fExtendSet);
46f4442e
A
2493 // Inhibit dictionary characters from being tested at all.
2494 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
73c04bcf 2495
46f4442e
A
2496 fSets->addElement(fCRSet, status);
2497 fSets->addElement(fLFSet, status);
2498 fSets->addElement(fNewlineSet, status);
73c04bcf
A
2499 fSets->addElement(fALetterSet, status);
2500 fSets->addElement(fKatakanaSet, status);
2501 fSets->addElement(fMidLetterSet, status);
46f4442e 2502 fSets->addElement(fMidNumLetSet, status);
73c04bcf
A
2503 fSets->addElement(fMidNumSet, status);
2504 fSets->addElement(fNumericSet, status);
2505 fSets->addElement(fFormatSet, status);
2506 fSets->addElement(fExtendSet, status);
2507 fSets->addElement(fOtherSet, status);
2508 fSets->addElement(fExtendNumLetSet, status);
2509
73c04bcf
A
2510 if (U_FAILURE(status)) {
2511 deferredStatus = status;
2512 }
2513}
2514
2515void RBBIWordMonkey::setText(const UnicodeString &s) {
2516 fText = &s;
2517}
2518
2519
2520int32_t RBBIWordMonkey::next(int32_t prevPos) {
2521 int p0, p1, p2, p3; // Indices of the significant code points around the
2522 // break position being tested. The candidate break
2523 // location is before p2.
2524
2525 int breakPos = -1;
2526
2527 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
46f4442e
A
2528
2529 if (U_FAILURE(deferredStatus)) {
2530 return -1;
2531 }
73c04bcf
A
2532
2533 // Prev break at end of string. return DONE.
2534 if (prevPos >= fText->length()) {
2535 return -1;
2536 }
2537 p0 = p1 = p2 = p3 = prevPos;
2538 c3 = fText->char32At(prevPos);
2539 c0 = c1 = c2 = 0;
2540
2541 // Loop runs once per "significant" character position in the input text.
2542 for (;;) {
2543 // Move all of the positions forward in the input string.
2544 p0 = p1; c0 = c1;
2545 p1 = p2; c1 = c2;
2546 p2 = p3; c2 = c3;
2547
2548 // Advancd p3 by X(Extend | Format)* Rule 4
46f4442e 2549 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
73c04bcf
A
2550 do {
2551 p3 = fText->moveIndex32(p3, 1);
2552 c3 = fText->char32At(p3);
46f4442e
A
2553 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2554 break;
2555 };
73c04bcf
A
2556 }
2557 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2558
2559
2560 if (p1 == p2) {
2561 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2562 continue;
2563 }
2564 if (p2 == fText->length()) {
2565 // Reached end of string. Always a break position.
2566 break;
2567 }
46f4442e 2568
73c04bcf
A
2569 // Rule (3) CR x LF
2570 // No Extend or Format characters may appear between the CR and LF,
2571 // which requires the additional check for p2 immediately following p1.
2572 //
46f4442e 2573 if (c1==0x0D && c2==0x0A) {
73c04bcf
A
2574 continue;
2575 }
46f4442e
A
2576
2577 // Rule (3a) Break before and after newlines (including CR and LF)
2578 //
2579 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2580 break;
2581 };
2582 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2583 break;
2584 };
73c04bcf
A
2585
2586 // Rule (5). ALetter x ALetter
2587 if (fALetterSet->contains(c1) &&
2588 fALetterSet->contains(c2)) {
2589 continue;
2590 }
2591
2592 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2593 //
73c04bcf 2594 if ( fALetterSet->contains(c1) &&
46f4442e 2595 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
73c04bcf
A
2596 fALetterSet->contains(c3)) {
2597 continue;
2598 }
2599
2600
2601 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2602 if (fALetterSet->contains(c0) &&
46f4442e 2603 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
73c04bcf
A
2604 fALetterSet->contains(c2)) {
2605 continue;
2606 }
2607
2608 // Rule (8) Numeric x Numeric
2609 if (fNumericSet->contains(c1) &&
2610 fNumericSet->contains(c2)) {
2611 continue;
2612 }
2613
2614 // Rule (9) ALetter x Numeric
2615 if (fALetterSet->contains(c1) &&
2616 fNumericSet->contains(c2)) {
2617 continue;
2618 }
2619
2620 // Rule (10) Numeric x ALetter
2621 if (fNumericSet->contains(c1) &&
2622 fALetterSet->contains(c2)) {
2623 continue;
2624 }
2625
2626 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
46f4442e
A
2627 if (fNumericSet->contains(c0) &&
2628 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
73c04bcf
A
2629 fNumericSet->contains(c2)) {
2630 continue;
2631 }
2632
2633 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2634 if (fNumericSet->contains(c1) &&
46f4442e 2635 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
73c04bcf
A
2636 fNumericSet->contains(c3)) {
2637 continue;
2638 }
2639
2640 // Rule (13) Katakana x Katakana
2641 if (fKatakanaSet->contains(c1) &&
2642 fKatakanaSet->contains(c2)) {
2643 continue;
2644 }
2645
2646 // Rule 13a
2647 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2648 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2649 fExtendNumLetSet->contains(c2)) {
2650 continue;
2651 }
2652
2653 // Rule 13b
2654 if (fExtendNumLetSet->contains(c1) &&
2655 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2656 fKatakanaSet->contains(c2))) {
2657 continue;
2658 }
2659
2660 // Rule 14. Break found here.
2661 break;
2662 }
2663
2664 breakPos = p2;
2665 return breakPos;
2666}
2667
2668
2669UVector *RBBIWordMonkey::charClasses() {
2670 return fSets;
2671}
2672
2673
2674RBBIWordMonkey::~RBBIWordMonkey() {
2675 delete fSets;
46f4442e
A
2676 delete fCRSet;
2677 delete fLFSet;
2678 delete fNewlineSet;
73c04bcf
A
2679 delete fKatakanaSet;
2680 delete fALetterSet;
46f4442e 2681 delete fMidNumLetSet;
73c04bcf
A
2682 delete fMidLetterSet;
2683 delete fMidNumSet;
2684 delete fNumericSet;
2685 delete fFormatSet;
2686 delete fExtendSet;
2687 delete fExtendNumLetSet;
2688 delete fOtherSet;
2689}
2690
2691
2692
2693
2694//------------------------------------------------------------------------------------------
2695//
2696// class RBBISentMonkey Sentence Break specific implementation
2697// of RBBIMonkeyKind.
2698//
2699//------------------------------------------------------------------------------------------
2700class RBBISentMonkey: public RBBIMonkeyKind {
2701public:
2702 RBBISentMonkey();
2703 virtual ~RBBISentMonkey();
2704 virtual UVector *charClasses();
2705 virtual void setText(const UnicodeString &s);
2706 virtual int32_t next(int32_t i);
2707private:
2708 int moveBack(int posFrom);
2709 int moveForward(int posFrom);
2710 UChar32 cAt(int pos);
2711
2712 UVector *fSets;
2713
2714 UnicodeSet *fSepSet;
2715 UnicodeSet *fFormatSet;
2716 UnicodeSet *fSpSet;
2717 UnicodeSet *fLowerSet;
2718 UnicodeSet *fUpperSet;
2719 UnicodeSet *fOLetterSet;
2720 UnicodeSet *fNumericSet;
2721 UnicodeSet *fATermSet;
46f4442e 2722 UnicodeSet *fSContinueSet;
73c04bcf
A
2723 UnicodeSet *fSTermSet;
2724 UnicodeSet *fCloseSet;
2725 UnicodeSet *fOtherSet;
2726 UnicodeSet *fExtendSet;
2727
2728 const UnicodeString *fText;
2729
2730};
2731
2732RBBISentMonkey::RBBISentMonkey()
2733{
2734 UErrorCode status = U_ZERO_ERROR;
2735
2736 fSets = new UVector(status);
2737
46f4442e
A
2738 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2739 // set and made into character classes of their own. For the monkey impl,
2740 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2741 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2742 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2743 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2744 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2745 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2746 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2747 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2748 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2749 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2750 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2751 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2752 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
73c04bcf
A
2753 fOtherSet = new UnicodeSet();
2754
2755 if(U_FAILURE(status)) {
2756 deferredStatus = status;
2757 return;
2758 }
2759
2760 fOtherSet->complement();
2761 fOtherSet->removeAll(*fSepSet);
2762 fOtherSet->removeAll(*fFormatSet);
2763 fOtherSet->removeAll(*fSpSet);
2764 fOtherSet->removeAll(*fLowerSet);
2765 fOtherSet->removeAll(*fUpperSet);
2766 fOtherSet->removeAll(*fOLetterSet);
2767 fOtherSet->removeAll(*fNumericSet);
2768 fOtherSet->removeAll(*fATermSet);
46f4442e 2769 fOtherSet->removeAll(*fSContinueSet);
73c04bcf
A
2770 fOtherSet->removeAll(*fSTermSet);
2771 fOtherSet->removeAll(*fCloseSet);
2772 fOtherSet->removeAll(*fExtendSet);
2773
46f4442e
A
2774 fSets->addElement(fSepSet, status);
2775 fSets->addElement(fFormatSet, status);
2776 fSets->addElement(fSpSet, status);
2777 fSets->addElement(fLowerSet, status);
2778 fSets->addElement(fUpperSet, status);
2779 fSets->addElement(fOLetterSet, status);
2780 fSets->addElement(fNumericSet, status);
2781 fSets->addElement(fATermSet, status);
2782 fSets->addElement(fSContinueSet, status);
2783 fSets->addElement(fSTermSet, status);
2784 fSets->addElement(fCloseSet, status);
2785 fSets->addElement(fOtherSet, status);
2786 fSets->addElement(fExtendSet, status);
73c04bcf
A
2787
2788 if (U_FAILURE(status)) {
2789 deferredStatus = status;
2790 }
2791}
2792
2793
2794
2795void RBBISentMonkey::setText(const UnicodeString &s) {
2796 fText = &s;
2797}
2798
2799UVector *RBBISentMonkey::charClasses() {
2800 return fSets;
2801}
2802
2803
2804// moveBack() Find the "significant" code point preceding the index i.
2805// Skips over ($Extend | $Format)* .
46f4442e 2806//
73c04bcf
A
2807int RBBISentMonkey::moveBack(int i) {
2808 if (i <= 0) {
2809 return -1;
2810 }
2811 UChar32 c;
2812 int32_t j = i;
2813 do {
2814 j = fText->moveIndex32(j, -1);
2815 c = fText->char32At(j);
2816 }
2817 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2818 return j;
2819
2820 }
2821
2822
2823int RBBISentMonkey::moveForward(int i) {
2824 if (i>=fText->length()) {
2825 return fText->length();
2826 }
2827 UChar32 c;
2828 int32_t j = i;
2829 do {
2830 j = fText->moveIndex32(j, 1);
2831 c = cAt(j);
2832 }
2833 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2834 return j;
2835}
2836
2837UChar32 RBBISentMonkey::cAt(int pos) {
2838 if (pos<0 || pos>=fText->length()) {
2839 return -1;
2840 } else {
2841 return fText->char32At(pos);
2842 }
2843}
2844
//
//  next()   Reference (non-rule-based) computation of the next sentence break.
//
//     prevPos   index of the previous break; scanning starts here.
//     returns   index of the next break after prevPos, or -1 if the constructor
//               recorded a failure in deferredStatus or prevPos is at or beyond
//               the end of the text.
//
//     The loop slides a four-position window (p0..p3 / c0..c3) over the
//     "significant" code points of the text; the candidate break is always the
//     position just before p2.  Each rule either "continue"s (no break here,
//     advance the window) or "break"s out of the loop (break found at p2).
//     The rules are tested in order, so an earlier rule takes precedence.
//
2845int32_t RBBISentMonkey::next(int32_t prevPos) {
2846 int p0, p1, p2, p3; // Indices of the significant code points around the
2847 // break position being tested. The candidate break
2848 // location is before p2.
2849
2850 int breakPos = -1;
2851
2852 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2853 UChar32 c;
2854
46f4442e
A
 // Construction of the character sets failed; no meaningful answer is possible.
2855 if (U_FAILURE(deferredStatus)) {
2856 return -1;
2857 }
2858
73c04bcf
A
2859 // Prev break at end of string. return DONE.
2860 if (prevPos >= fText->length()) {
2861 return -1;
2862 }
 // Prime the window: only p3/c3 hold real data; the zeros shift out as the loop warms up.
2863 p0 = p1 = p2 = p3 = prevPos;
2864 c3 = fText->char32At(prevPos);
2865 c0 = c1 = c2 = 0;
2866
2867 // Loop runs once per "significant" character position in the input text.
2868 for (;;) {
2869 // Move all of the positions forward in the input string.
2870 p0 = p1; c0 = c1;
2871 p1 = p2; c1 = c2;
2872 p2 = p3; c2 = c3;
46f4442e 2873
73c04bcf
A
2874 // Advance p3 by X(Extend | Format)* Rule 4
2875 p3 = moveForward(p3);
2876 c3 = cAt(p3);
2877
2878 // Rule (3) CR x LF
 // The p2==(p1+1) test requires the LF to follow the CR immediately,
 // with no skipped Extend/Format characters in between.
2879 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2880 continue;
2881 }
46f4442e 2882
73c04bcf
A
2883 // Rule (4). Sep <break>
2884 if (fSepSet->contains(c1)) {
2885 p2 = p1+1; // Separators don't combine with Extend or Format.
2886 break;
2887 }
2888
2889 if (p2 >= fText->length()) {
2890 // Reached end of string. Always a break position.
2891 break;
2892 }
2893
2894 if (p2 == prevPos) {
2895 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2896 continue;
2897 }
46f4442e 2898
73c04bcf
A
2899 // Rule (6). ATerm x Numeric
2900 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2901 continue;
2902 }
2903
2904 // Rule (7). Upper ATerm x Upper
2905 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2906 continue;
2907 }
2908
2909 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2910 // Note: STerm | ATerm are added to the negated part of the expression by a
2911 // note to the Unicode 5.0 documents.
 // First scan backwards from p1 over Sp* then Close* looking for the ATerm ...
2912 int p8 = p1;
2913 while (fSpSet->contains(cAt(p8))) {
2914 p8 = moveBack(p8);
2915 }
2916 while (fCloseSet->contains(cAt(p8))) {
2917 p8 = moveBack(p8);
2918 }
2919 if (fATermSet->contains(cAt(p8))) {
 // ... then scan forwards from p2 to the first character that ends the
 // negated run (or the end of text, where cAt() returns -1).
2920 p8=p2;
2921 for (;;) {
2922 c = cAt(p8);
2923 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2924 fLowerSet->contains(c) || fSepSet->contains(c) ||
2925 fATermSet->contains(c) || fSTermSet->contains(c)) {
2926 break;
2927 }
2928 p8 = moveForward(p8);
2929 }
2930 if (fLowerSet->contains(cAt(p8))) {
2931 continue;
2932 }
2933 }
46f4442e
A
2934
2935 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2936 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
73c04bcf
A
2937 p8 = p1;
2938 while (fSpSet->contains(cAt(p8))) {
2939 p8 = moveBack(p8);
2940 }
2941 while (fCloseSet->contains(cAt(p8))) {
2942 p8 = moveBack(p8);
2943 }
2944 c = cAt(p8);
2945 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2946 continue;
2947 }
2948 }
2949
46f4442e 2950 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
73c04bcf
A
 // Only fCloseSet, fSpSet and fSepSet are tested for c2 here; CR and LF
 // have no separate test of their own.
2951 int p9 = p1;
2952 while (fCloseSet->contains(cAt(p9))) {
2953 p9 = moveBack(p9);
2954 }
2955 c = cAt(p9);
2956 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2957 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2958 continue;
2959 }
2960 }
2961
46f4442e 2962 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
73c04bcf
A
2963 int p10 = p1;
2964 while (fSpSet->contains(cAt(p10))) {
2965 p10 = moveBack(p10);
2966 }
2967 while (fCloseSet->contains(cAt(p10))) {
2968 p10 = moveBack(p10);
2969 }
2970 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2971 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2972 continue;
2973 }
2974 }
2975
46f4442e 2976 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
73c04bcf 2977 int p11 = p1;
46f4442e
A
 // The optional trailing separator: step back over at most one Sep.
2978 if (fSepSet->contains(cAt(p11))) {
2979 p11 = moveBack(p11);
2980 }
73c04bcf
A
2981 while (fSpSet->contains(cAt(p11))) {
2982 p11 = moveBack(p11);
2983 }
2984 while (fCloseSet->contains(cAt(p11))) {
2985 p11 = moveBack(p11);
2986 }
2987 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2988 break;
2989 }
2990
2991 // Rule (12) Any x Any
2992 continue;
2993 }
 // The loop is only left via "break", with p2 positioned at the break.
2994 breakPos = p2;
2995 return breakPos;
2996}
2997
2998RBBISentMonkey::~RBBISentMonkey() {
2999 delete fSets;
3000 delete fSepSet;
3001 delete fFormatSet;
3002 delete fSpSet;
3003 delete fLowerSet;
3004 delete fUpperSet;
3005 delete fOLetterSet;
3006 delete fNumericSet;
3007 delete fATermSet;
46f4442e 3008 delete fSContinueSet;
73c04bcf
A
3009 delete fSTermSet;
3010 delete fCloseSet;
3011 delete fOtherSet;
3012 delete fExtendSet;
3013}
3014
3015
3016
3017//-------------------------------------------------------------------------------------------
3018//
3019// RBBILineMonkey
3020//
3021//-------------------------------------------------------------------------------------------
3022
3023class RBBILineMonkey: public RBBIMonkeyKind {
3024public:
3025 RBBILineMonkey();
3026 virtual ~RBBILineMonkey();
3027 virtual UVector *charClasses();
3028 virtual void setText(const UnicodeString &s);
3029 virtual int32_t next(int32_t i);
3030 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3031private:
3032 UVector *fSets;
3033
3034 UnicodeSet *fBK;
3035 UnicodeSet *fCR;
3036 UnicodeSet *fLF;
3037 UnicodeSet *fCM;
3038 UnicodeSet *fNL;
3039 UnicodeSet *fSG;
3040 UnicodeSet *fWJ;
3041 UnicodeSet *fZW;
3042 UnicodeSet *fGL;
3043 UnicodeSet *fCB;
3044 UnicodeSet *fSP;
3045 UnicodeSet *fB2;
3046 UnicodeSet *fBA;
3047 UnicodeSet *fBB;
3048 UnicodeSet *fHY;
3049 UnicodeSet *fH2;
3050 UnicodeSet *fH3;
3051 UnicodeSet *fCL;
3052 UnicodeSet *fEX;
3053 UnicodeSet *fIN;
3054 UnicodeSet *fJL;
3055 UnicodeSet *fJV;
3056 UnicodeSet *fJT;
3057 UnicodeSet *fNS;
3058 UnicodeSet *fOP;
3059 UnicodeSet *fQU;
3060 UnicodeSet *fIS;
3061 UnicodeSet *fNU;
3062 UnicodeSet *fPO;
3063 UnicodeSet *fPR;
3064 UnicodeSet *fSY;
3065 UnicodeSet *fAI;
3066 UnicodeSet *fAL;
3067 UnicodeSet *fID;
3068 UnicodeSet *fSA;
3069 UnicodeSet *fXX;
3070
3071 BreakIterator *fCharBI;
3072
3073 const UnicodeString *fText;
3074 int32_t *fOrigPositions;
3075
3076 RegexMatcher *fNumberMatcher;
3077 RegexMatcher *fLB11Matcher;
3078};
3079
3080
3081RBBILineMonkey::RBBILineMonkey()
3082{
3083 UErrorCode status = U_ZERO_ERROR;
3084
3085 fSets = new UVector(status);
3086
46f4442e
A
3087 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3088 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3089 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3090 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3091 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3092 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3093 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3094 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3095 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3096 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3097 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3098 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3099 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3100 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3101 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3102 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3103 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3104 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3105 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3106 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3107 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3108 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3109 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3110 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3111 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3112 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3113 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3114 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3115 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3116 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3117 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3118 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3119 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3120 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3121 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3122 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
73c04bcf
A
3123
3124 if (U_FAILURE(status)) {
3125 deferredStatus = status;
3126 fCharBI = NULL;
3127 fNumberMatcher = NULL;
3128 return;
3129 }
3130
3131 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3132 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3133 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
3134 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3135
3136 fSets->addElement(fBK, status);
3137 fSets->addElement(fCR, status);
3138 fSets->addElement(fLF, status);
3139 fSets->addElement(fCM, status);
3140 fSets->addElement(fNL, status);
3141 fSets->addElement(fWJ, status);
3142 fSets->addElement(fZW, status);
3143 fSets->addElement(fGL, status);
3144 fSets->addElement(fCB, status);
3145 fSets->addElement(fSP, status);
3146 fSets->addElement(fB2, status);
3147 fSets->addElement(fBA, status);
3148 fSets->addElement(fBB, status);
3149 fSets->addElement(fHY, status);
3150 fSets->addElement(fH2, status);
3151 fSets->addElement(fH3, status);
3152 fSets->addElement(fCL, status);
3153 fSets->addElement(fEX, status);
3154 fSets->addElement(fIN, status);
3155 fSets->addElement(fJL, status);
3156 fSets->addElement(fJT, status);
3157 fSets->addElement(fJV, status);
3158 fSets->addElement(fNS, status);
3159 fSets->addElement(fOP, status);
3160 fSets->addElement(fQU, status);
3161 fSets->addElement(fIS, status);
3162 fSets->addElement(fNU, status);
3163 fSets->addElement(fPO, status);
3164 fSets->addElement(fPR, status);
3165 fSets->addElement(fSY, status);
3166 fSets->addElement(fAI, status);
3167 fSets->addElement(fAL, status);
3168 fSets->addElement(fID, status);
3169 fSets->addElement(fWJ, status);
3170 fSets->addElement(fSA, status);
3171 fSets->addElement(fSG, status);
3172
46f4442e
A
3173 const char *rules =
3174 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3175 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3176 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3177 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3178 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
3179 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3180
73c04bcf 3181 fNumberMatcher = new RegexMatcher(
46f4442e 3182 UnicodeString(rules, -1, US_INV), 0, status);
73c04bcf
A
3183
3184 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3185
3186 if (U_FAILURE(status)) {
3187 deferredStatus = status;
3188 }
3189}
3190
3191
3192void RBBILineMonkey::setText(const UnicodeString &s) {
3193 fText = &s;
3194 fCharBI->setText(s);
3195 fNumberMatcher->reset(s);
3196}
3197
3198//
3199// rule9Adjust
3200// Line Break TR rules 9 and 10 implementation.
3201// This deals with combining marks and other sequences that
3202// that must be treated as if they were something other than what they actually are.
3203//
3204// This is factored out into a separate function because it must be applied twice for
3205// each potential break, once to the chars before the position being checked, then
3206// again to the text following the possible break.
3207//
3208void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3209 if (pos == -1) {
3210 // Invalid initial position. Happens during the warmup iteration of the
3211 // main loop in next().
3212 return;
3213 }
3214
3215 int32_t nPos = *nextPos;
3216
3217 // LB 9 Keep combining sequences together.
3218 // advance over any CM class chars. Note that Line Break CM is different
3219 // from the normal Grapheme Extend property.
3220 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3221 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3222 for (;;) {
3223 *nextChar = fText->char32At(nPos);
3224 if (!fCM->contains(*nextChar)) {
3225 break;
3226 }
3227 nPos = fText->moveIndex32(nPos, 1);
3228 }
3229 }
3230
3231
3232 // LB 9 Treat X CM* as if it were x.
3233 // No explicit action required.
3234
3235 // LB 10 Treat any remaining combining mark as AL
3236 if (fCM->contains(*posChar)) {
3237 *posChar = 0x41; // thisChar = 'A';
3238 }
3239
3240 // Push the updated nextPos and nextChar back to our caller.
3241 // This only makes a difference if posChar got bigger by consuming a
3242 // combining sequence.
3243 *nextPos = nPos;
3244 *nextChar = fText->char32At(nPos);
3245}
3246
3247
3248
3249int32_t RBBILineMonkey::next(int32_t startPos) {
3250 UErrorCode status = U_ZERO_ERROR;
3251 int32_t pos; // Index of the char following a potential break position
3252 UChar32 thisChar; // Character at above position "pos"
3253
3254 int32_t prevPos; // Index of the char preceding a potential break position
3255 UChar32 prevChar; // Character at above position. Note that prevChar
3256 // and thisChar may not be adjacent because combining
3257 // characters between them will be ignored.
3258
3259 int32_t nextPos; // Index of the next character following pos.
3260 // Usually skips over combining marks.
3261 int32_t nextCPPos; // Index of the code point following "pos."
3262 // May point to a combining mark.
3263 int32_t tPos; // temp value.
3264 UChar32 c;
3265
46f4442e
A
3266 if (U_FAILURE(deferredStatus)) {
3267 return -1;
3268 }
3269
73c04bcf
A
3270 if (startPos >= fText->length()) {
3271 return -1;
3272 }
3273
3274
3275 // Initial values for loop. Loop will run the first time without finding breaks,
3276 // while the invalid values shift out and the "this" and
3277 // "prev" positions are filled in with good values.
3278 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
3279 thisChar = prevChar = 0;
3280 nextPos = nextCPPos = startPos;
3281
3282
3283 // Loop runs once per position in the test text, until a break position
3284 // is found.
3285 for (;;) {
3286 prevPos = pos;
3287 prevChar = thisChar;
3288
3289 pos = nextPos;
3290 thisChar = fText->char32At(pos);
3291
3292 nextCPPos = fText->moveIndex32(pos, 1);
3293 nextPos = nextCPPos;
3294
3295 // Rule LB2 - Break at end of text.
3296 if (pos >= fText->length()) {
3297 break;
3298 }
3299
3300 // Rule LB 9 - adjust for combining sequences.
3301 // We do this one out-of-order because the adjustment does not change anything
3302 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3303 // be applied.
3304 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3305 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3306 c = fText->char32At(nextPos);
3307 rule9Adjust(pos, &thisChar, &nextPos, &c);
3308
3309 // If the loop is still warming up - if we haven't shifted the initial
3310 // -1 positions out of prevPos yet - loop back to advance the
3311 // position in the input without any further looking for breaks.
3312 if (prevPos == -1) {
3313 continue;
3314 }
46f4442e 3315
73c04bcf
A
3316 // LB 4 Always break after hard line breaks,
3317 if (fBK->contains(prevChar)) {
3318 break;
3319 }
3320
3321 // LB 5 Break after CR, LF, NL, but not inside CR LF
3322 if (prevChar == 0x0d && thisChar == 0x0a) {
3323 continue;
3324 }
3325 if (prevChar == 0x0d ||
3326 prevChar == 0x0a ||
3327 prevChar == 0x85) {
3328 break;
3329 }
3330
3331 // LB 6 Don't break before hard line breaks
3332 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3333 fBK->contains(thisChar)) {
3334 continue;
3335 }
3336
3337
3338 // LB 7 Don't break before spaces or zero-width space.
3339 if (fSP->contains(thisChar)) {
3340 continue;
3341 }
3342
3343 if (fZW->contains(thisChar)) {
3344 continue;
3345 }
3346
3347 // LB 8 Break after zero width space
3348 if (fZW->contains(prevChar)) {
3349 break;
3350 }
3351
3352 // LB 9, 10 Already done, at top of loop.
3353 //
3354
3355
3356 // LB 11 Do not break before or after WORD JOINER and related characters.
3357 // x WJ
3358 // WJ x
3359 //
3360 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3361 continue;
3362 }
3363
3364 // LB 12
73c04bcf 3365 // GL x
46f4442e 3366 if (fGL->contains(prevChar)) {
73c04bcf
A
3367 continue;
3368 }
3369
46f4442e
A
3370 // LB 12a
3371 // [^SP BA HY] x GL
3372 if (!(fSP->contains(prevChar) ||
3373 fBA->contains(prevChar) ||
3374 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3375 continue;
3376 }
3377
3378
73c04bcf
A
3379
3380 // LB 13 Don't break before closings.
3381 // NU x CL and NU x IS are not matched here so that they will
3382 // fall into LB 17 and the more general number regular expression.
3383 //
3384 if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
3385 fEX->contains(thisChar) ||
3386 !fNU->contains(prevChar) && fIS->contains(thisChar) ||
3387 !fNU->contains(prevChar) && fSY->contains(thisChar)) {
3388 continue;
3389 }
3390
3391 // LB 14 Don't break after OP SP*
3392 // Scan backwards, checking for this sequence.
3393 // The OP char could include combining marks, so we actually check for
3394 // OP CM* SP*
3395 // Another Twist: The Rule 67 fixes may have changed a SP CM
3396 // sequence into a ID char, so before scanning back through spaces,
3397 // verify that prevChar is indeed a space. The prevChar variable
3398 // may differ from fText[prevPos]
3399 tPos = prevPos;
3400 if (fSP->contains(prevChar)) {
3401 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3402 tPos=fText->moveIndex32(tPos, -1);
3403 }
3404 }
3405 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3406 tPos=fText->moveIndex32(tPos, -1);
3407 }
3408 if (fOP->contains(fText->char32At(tPos))) {
3409 continue;
3410 }
3411
3412
3413 // LB 15 QU SP* x OP
3414 if (fOP->contains(thisChar)) {
3415 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3416 int tPos = prevPos;
3417 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3418 tPos = fText->moveIndex32(tPos, -1);
3419 }
3420 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3421 tPos = fText->moveIndex32(tPos, -1);
3422 }
3423 if (fQU->contains(fText->char32At(tPos))) {
3424 continue;
3425 }
3426 }
3427
3428
3429
3430 // LB 16 CL SP* x NS
3431 // Scan backwards for SP* CM* CL
3432 if (fNS->contains(thisChar)) {
3433 int tPos = prevPos;
3434 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3435 tPos = fText->moveIndex32(tPos, -1);
3436 }
3437 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3438 tPos = fText->moveIndex32(tPos, -1);
3439 }
3440 if (fCL->contains(fText->char32At(tPos))) {
3441 continue;
3442 }
3443 }
3444
3445
3446 // LB 17 B2 SP* x B2
3447 if (fB2->contains(thisChar)) {
3448 // Scan backwards, checking for the B2 CM* SP* sequence.
3449 tPos = prevPos;
3450 if (fSP->contains(prevChar)) {
3451 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3452 tPos=fText->moveIndex32(tPos, -1);
3453 }
3454 }
3455 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3456 tPos=fText->moveIndex32(tPos, -1);
3457 }
3458 if (fB2->contains(fText->char32At(tPos))) {
3459 continue;
3460 }
3461 }
3462
46f4442e 3463
73c04bcf
A
3464 // LB 18 break after space
3465 if (fSP->contains(prevChar)) {
3466 break;
3467 }
3468
3469 // LB 19
3470 // x QU
3471 // QU x
3472 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3473 continue;
3474 }
3475
3476 // LB 20 Break around a CB
3477 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3478 break;
3479 }
3480
3481 // LB 21
3482 if (fBA->contains(thisChar) ||
3483 fHY->contains(thisChar) ||
3484 fNS->contains(thisChar) ||
3485 fBB->contains(prevChar) ) {
3486 continue;
3487 }
3488
3489 // LB 22
3490 if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
3491 fID->contains(prevChar) && fIN->contains(thisChar) ||
3492 fIN->contains(prevChar) && fIN->contains(thisChar) ||
3493 fNU->contains(prevChar) && fIN->contains(thisChar) ) {
3494 continue;
3495 }
3496
3497
3498 // LB 23 ID x PO
3499 // AL x NU
3500 // NU x AL
3501 if (fID->contains(prevChar) && fPO->contains(thisChar) ||
3502 fAL->contains(prevChar) && fNU->contains(thisChar) ||
3503 fNU->contains(prevChar) && fAL->contains(thisChar) ) {
3504 continue;
3505 }
3506
3507 // LB 24 Do not break between prefix and letters or ideographs.
3508 // PR x ID
3509 // PR x AL
3510 // PO x AL
3511 if (fPR->contains(prevChar) && fID->contains(thisChar) ||
3512 fPR->contains(prevChar) && fAL->contains(thisChar) ||
3513 fPO->contains(prevChar) && fAL->contains(thisChar) ) {
3514 continue;
3515 }
46f4442e
A
3516
3517
3518
73c04bcf
A
3519 // LB 25 Numbers
3520 if (fNumberMatcher->lookingAt(prevPos, status)) {
3521 if (U_FAILURE(status)) {
3522 break;
3523 }
3524 // Matched a number. But could have been just a single digit, which would
3525 // not represent a "no break here" between prevChar and thisChar
3526 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3527 if (numEndIdx > pos) {
3528 // Number match includes at least our two chars being checked
3529 if (numEndIdx > nextPos) {
3530 // Number match includes additional chars. Update pos and nextPos
3531 // so that next loop iteration will continue at the end of the number,
3532 // checking for breaks between last char in number & whatever follows.
3533 pos = nextPos = numEndIdx;
3534 do {
3535 pos = fText->moveIndex32(pos, -1);
3536 thisChar = fText->char32At(pos);
3537 } while (fCM->contains(thisChar));
3538 }
3539 continue;
3540 }
3541 }
3542
3543
3544 // LB 26 Do not break a Korean syllable.
3545 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3546 fJV->contains(thisChar) ||
3547 fH2->contains(thisChar) ||
3548 fH3->contains(thisChar))) {
3549 continue;
3550 }
3551
3552 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3553 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3554 continue;
3555 }
3556
3557 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3558 fJT->contains(thisChar)) {
3559 continue;
3560 }
3561
3562 // LB 27 Treat a Korean Syllable Block the same as ID.
3563 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3564 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3565 fIN->contains(thisChar)) {
3566 continue;
3567 }
3568 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3569 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3570 fPO->contains(thisChar)) {
3571 continue;
3572 }
3573 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3574 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3575 continue;
3576 }
3577
3578
3579
46f4442e 3580 // LB 28 Do not break between alphabetics ("at").
73c04bcf
A
3581 if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3582 continue;
3583 }
3584
3585 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3586 if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3587 continue;
3588 }
3589
73c04bcf
A
3590 // LB 31 Break everywhere else
3591 break;
3592
3593 }
3594
3595 return pos;
3596}
3597
3598
3599UVector *RBBILineMonkey::charClasses() {
3600 return fSets;
3601}
3602
3603
3604RBBILineMonkey::~RBBILineMonkey() {
3605 delete fSets;
3606
3607 delete fBK;
3608 delete fCR;
3609 delete fLF;
3610 delete fCM;
3611 delete fNL;
3612 delete fWJ;
3613 delete fZW;
3614 delete fGL;
3615 delete fCB;
3616 delete fSP;
3617 delete fB2;
3618 delete fBA;
3619 delete fBB;
3620 delete fHY;
3621 delete fH2;
3622 delete fH3;
3623 delete fCL;
3624 delete fEX;
3625 delete fIN;
3626 delete fJL;
3627 delete fJV;
3628 delete fJT;
3629 delete fNS;
3630 delete fOP;
3631 delete fQU;
3632 delete fIS;
3633 delete fNU;
3634 delete fPO;
3635 delete fPR;
3636 delete fSY;
3637 delete fAI;
3638 delete fAL;
3639 delete fID;
3640 delete fSA;
3641 delete fSG;
3642 delete fXX;
3643
3644 delete fCharBI;
3645 delete fNumberMatcher;
3646}
3647
3648
3649//-------------------------------------------------------------------------------------------
3650//
3651// TestMonkey
3652//
3653// params
3654// seed=nnnnn Random number starting seed.
3655// Setting the seed allows errors to be reproduced.
3656// loop=nnn Looping count. Controls running time.
3657// -1: run forever.
3658// 0 or greater: run length.
3659//
3660// type = char | word | line | sent | title
3661//
3662//-------------------------------------------------------------------------------------------
3663
3664static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3665 int32_t val = defaultVal;
3666 name.append(" *= *(-?\\d+)");
3667 UErrorCode status = U_ZERO_ERROR;
3668 RegexMatcher m(name, params, 0, status);
3669 if (m.find()) {
3670 // The param exists. Convert the string to an int.
3671 char valString[100];
3672 int32_t paramLength = m.end(1, status) - m.start(1, status);
3673 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3674 paramLength = (int32_t)(sizeof(valString)-2);
3675 }
3676 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3677 val = strtol(valString, NULL, 10);
3678
3679 // Delete this parameter from the params string.
3680 m.reset();
3681 params = m.replaceFirst("", status);
3682 }
3683 U_ASSERT(U_SUCCESS(status));
3684 return val;
3685}
3686#endif
3687
3688static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3689 BreakIterator *bi,
3690 int expected[],
3691 int expectedcount)
3692{
3693 int count = 0;
3694 int i = 0;
3695 int forward[50];
3696 bi->setText(ustr);
3697 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3698 forward[count] = i;
3699 if (count < expectedcount && expected[count] != i) {
3700 test->errln("break forward test failed: expected %d but got %d",
3701 expected[count], i);
3702 break;
3703 }
3704 count ++;
3705 }
3706 if (count != expectedcount) {
3707 printStringBreaks(ustr, expected, expectedcount);
3708 test->errln("break forward test failed: missed %d match",
3709 expectedcount - count);
3710 return;
3711 }
3712 // testing boundaries
3713 for (i = 1; i < expectedcount; i ++) {
3714 int j = expected[i - 1];
3715 if (!bi->isBoundary(j)) {
3716 printStringBreaks(ustr, expected, expectedcount);
3717 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3718 return;
3719 }
3720 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3721 if (bi->isBoundary(j)) {
3722 printStringBreaks(ustr, expected, expectedcount);
3723 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3724 return;
3725 }
3726 }
3727 }
3728
3729 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3730 count --;
3731 if (forward[count] != i) {
3732 test->errln("happy break test previous() failed: expected %d but got %d",
3733 forward[count], i);
3734 break;
3735 }
3736 }
3737 if (count != 0) {
3738 printStringBreaks(ustr, expected, expectedcount);
3739 test->errln("break test previous() failed: missed a match");
3740 return;
3741 }
3742
3743 // testing preceding
3744 for (i = 0; i < expectedcount - 1; i ++) {
3745 // int j = expected[i] + 1;
3746 int j = ustr.moveIndex32(expected[i], 1);
3747 for (; j <= expected[i + 1]; j ++) {
3748 if (bi->preceding(j) != expected[i]) {
3749 printStringBreaks(ustr, expected, expectedcount);
3750 test->errln("preceding(): Not expecting boundary at position %d", j);
3751 return;
3752 }
3753 }
3754 }
3755}
3756
3757void RBBITest::TestWordBreaks(void)
3758{
3759#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3760
73c04bcf
A
3761 Locale locale("en");
3762 UErrorCode status = U_ZERO_ERROR;
3763 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3764 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
73c04bcf
A
3765 static const char *strlist[] =
3766 {
3767 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3768 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
46f4442e 3769 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
73c04bcf
A
3770 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3771 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3772 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3773 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3774 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3775 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3776 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3777 "\\u2027\\U000e0067\\u0a47\\u00b7",
3778 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3779 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3780 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3781 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3782 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3783 "\\u0027\\u11af\\U000e0057\\u0602",
3784 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3785 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3786 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3787 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
46f4442e 3788 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
73c04bcf
A
3789 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3790 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3791 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3792 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3793 "\\u58f4\\U000e0049\\u20e7\\u2027",
3794 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3795 "\\ua183\\u102d\\u0bec\\u003a",
3796 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3797 "\\u003a\\u0e57\\u0fad\\u002e",
3798 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3799 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3800 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3801 "\\u003a\\u0664\\u00b7\\u1fba",
3802 "\\u003b\\u0027\\u00b7\\u47a3",
3803 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3804 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3805 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3806 };
3807 int loop;
3808 if (U_FAILURE(status)) {
3809 errln("Creation of break iterator failed %s", u_errorName(status));
3810 return;
3811 }
3812 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3813 // printf("looping %d\n", loop);
46f4442e 3814 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
73c04bcf
A
3815 // RBBICharMonkey monkey;
3816 RBBIWordMonkey monkey;
3817
3818 int expected[50];
3819 int expectedcount = 0;
3820
3821 monkey.setText(ustr);
3822 int i;
3823 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3824 expected[expectedcount ++] = i;
3825 }
3826
3827 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3828 }
3829 delete bi;
3830#endif
3831}
3832
3833void RBBITest::TestWordBoundary(void)
3834{
3835 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3836 Locale locale("en");
3837 UErrorCode status = U_ZERO_ERROR;
3838 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3839 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3840 UChar str[50];
3841 static const char *strlist[] =
3842 {
3843 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3844 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3845 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3846 "\\u2027\\U000e0067\\u0a47\\u00b7",
3847 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3848 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3849 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3850 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3851 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3852 "\\u0027\\u11af\\U000e0057\\u0602",
3853 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3854 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3855 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3856 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3857 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3858 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3859 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3860 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3861 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3862 "\\u58f4\\U000e0049\\u20e7\\u2027",
3863 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3864 "\\ua183\\u102d\\u0bec\\u003a",
3865 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3866 "\\u003a\\u0e57\\u0fad\\u002e",
3867 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3868 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3869 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3870 "\\u003a\\u0664\\u00b7\\u1fba",
3871 "\\u003b\\u0027\\u00b7\\u47a3",
3872 };
3873 int loop;
3874 if (U_FAILURE(status)) {
3875 errln("Creation of break iterator failed %s", u_errorName(status));
3876 return;
3877 }
3878 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3879 // printf("looping %d\n", loop);
3880 u_unescape(strlist[loop], str, 20);
3881 UnicodeString ustr(str);
3882 int forward[50];
3883 int count = 0;
3884
3885 bi->setText(ustr);
3886 int prev = 0;
3887 int i;
3888 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3889 forward[count ++] = i;
3890 if (i > prev) {
3891 int j;
3892 for (j = prev + 1; j < i; j ++) {
3893 if (bi->isBoundary(j)) {
3894 printStringBreaks(ustr, forward, count);
3895 errln("happy boundary test failed: expected %d not a boundary",
3896 j);
3897 return;
3898 }
3899 }
3900 }
3901 if (!bi->isBoundary(i)) {
3902 printStringBreaks(ustr, forward, count);
3903 errln("happy boundary test failed: expected %d a boundary",
3904 i);
3905 return;
3906 }
3907 prev = i;
3908 }
3909 }
3910 delete bi;
3911}
3912
3913void RBBITest::TestLineBreaks(void)
3914{
3915#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3916 Locale locale("en");
3917 UErrorCode status = U_ZERO_ERROR;
3918 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3919 const int32_t STRSIZE = 50;
3920 UChar str[STRSIZE];
3921 static const char *strlist[] =
3922 {
3923 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3924 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3925 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3926 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3927 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3928 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3929 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3930 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3931 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3932 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3933 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3934 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3935 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3936 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3937 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3938 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3939 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3940 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3941 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3942 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3943 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3944 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3945 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3946 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3947 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3948 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3949 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3950 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3951 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3952 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3953 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3954 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3955 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3956 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3957 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3958 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3959 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3960 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3961 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3962 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3963 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3964 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3965 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3966 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3967 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3968 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3969 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3970 };
3971 int loop;
3972 TEST_ASSERT_SUCCESS(status);
3973 if (U_FAILURE(status)) {
3974 return;
3975 }
3976 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3977 // printf("looping %d\n", loop);
3978 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3979 if (t >= STRSIZE) {
3980 TEST_ASSERT(FALSE);
3981 continue;
3982 }
3983
46f4442e 3984
73c04bcf
A
3985 UnicodeString ustr(str);
3986 RBBILineMonkey monkey;
3987 if (U_FAILURE(monkey.deferredStatus)) {
3988 continue;
3989 }
3990
3991 const int EXPECTEDSIZE = 50;
3992 int expected[EXPECTEDSIZE];
3993 int expectedcount = 0;
3994
3995 monkey.setText(ustr);
3996 int i;
3997 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3998 if (expectedcount >= EXPECTEDSIZE) {
3999 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4000 return;
4001 }
4002 expected[expectedcount ++] = i;
4003 }
4004
4005 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4006 }
4007 delete bi;
4008#endif
4009}
4010
4011void RBBITest::TestSentBreaks(void)
4012{
4013#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4014 Locale locale("en");
4015 UErrorCode status = U_ZERO_ERROR;
4016 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4017 UChar str[200];
4018 static const char *strlist[] =
4019 {
4020 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4021 "This\n",
4022 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4023 "\"Sentence ending with a quote.\" Bye.",
4024 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4025 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4026 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4027 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4028 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4029 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4030 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4031 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4032 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4033 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4034 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4035 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4036 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4037 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4038 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4039 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4040 };
4041 int loop;
4042 if (U_FAILURE(status)) {
4043 errln("Creation of break iterator failed %s", u_errorName(status));
4044 return;
4045 }
4046 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4047 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4048 UnicodeString ustr(str);
4049
4050 RBBISentMonkey monkey;
4051 if (U_FAILURE(monkey.deferredStatus)) {
4052 continue;
4053 }
4054
4055 const int EXPECTEDSIZE = 50;
4056 int expected[EXPECTEDSIZE];
4057 int expectedcount = 0;
4058
4059 monkey.setText(ustr);
4060 int i;
4061 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4062 if (expectedcount >= EXPECTEDSIZE) {
4063 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4064 return;
4065 }
4066 expected[expectedcount ++] = i;
4067 }
4068
4069 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4070 }
4071 delete bi;
4072#endif
4073}
4074
4075void RBBITest::TestMonkey(char *params) {
4076#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4077
4078 UErrorCode status = U_ZERO_ERROR;
4079 int32_t loopCount = 500;
4080 int32_t seed = 1;
4081 UnicodeString breakType = "all";
4082 Locale locale("en");
4083 UBool useUText = FALSE;
4084
4085 if (quick == FALSE) {
4086 loopCount = 10000;
4087 }
4088
4089 if (params) {
4090 UnicodeString p(params);
4091 loopCount = getIntParam("loop", p, loopCount);
4092 seed = getIntParam("seed", p, seed);
4093
4094 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4095 if (m.find()) {
4096 breakType = m.group(1, status);
4097 m.reset();
4098 p = m.replaceFirst("", status);
4099 }
4100
4101 RegexMatcher u(" *utext", p, 0, status);
4102 if (u.find()) {
4103 useUText = TRUE;
4104 u.reset();
4105 p = u.replaceFirst("", status);
4106 }
4107
4108
4109 // m.reset(p);
46f4442e 4110 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
73c04bcf
A
4111 // Each option is stripped out of the option string as it is processed.
4112 // All options have been checked. The option string should have been completely emptied..
4113 char buf[100];
4114 p.extract(buf, sizeof(buf), NULL, status);
4115 buf[sizeof(buf)-1] = 0;
4116 errln("Unrecognized or extra parameter: %s\n", buf);
4117 return;
4118 }
4119
4120 }
4121
4122 if (breakType == "char" || breakType == "all") {
4123 RBBICharMonkey m;
4124 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4125 if (U_SUCCESS(status)) {
4126 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4127 if (breakType == "all" && useUText==FALSE) {
4128 // Also run a quick test with UText when "all" is specified
4129 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4130 }
4131 }
4132 else {
4133 errln("Creation of character break iterator failed %s", u_errorName(status));
4134 }
4135 delete bi;
4136 }
4137
4138 if (breakType == "word" || breakType == "all") {
4139 logln("Word Break Monkey Test");
4140 RBBIWordMonkey m;
4141 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4142 if (U_SUCCESS(status)) {
4143 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4144 }
4145 else {
4146 errln("Creation of word break iterator failed %s", u_errorName(status));
4147 }
4148 delete bi;
4149 }
4150
4151 if (breakType == "line" || breakType == "all") {
4152 logln("Line Break Monkey Test");
4153 RBBILineMonkey m;
4154 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4155 if (loopCount >= 10) {
4156 loopCount = loopCount / 5; // Line break runs slower than the others.
4157 }
4158 if (U_SUCCESS(status)) {
4159 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4160 }
4161 else {
4162 errln("Creation of line break iterator failed %s", u_errorName(status));
4163 }
4164 delete bi;
4165 }
4166
46f4442e 4167 if (breakType == "sent" || breakType == "all" ) {
73c04bcf
A
4168 logln("Sentence Break Monkey Test");
4169 RBBISentMonkey m;
4170 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4171 if (loopCount >= 10) {
4172 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4173 }
4174 if (U_SUCCESS(status)) {
4175 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4176 }
4177 else {
4178 errln("Creation of line break iterator failed %s", u_errorName(status));
4179 }
4180 delete bi;
4181 }
4182
4183#endif
4184}
4185
4186//
4187// Run a RBBI monkey test. Common routine, for all break iterator types.
4188// Parameters:
4189// bi - the break iterator to use
4190// mk - MonkeyKind, abstraction for obtaining expected results
4191// name - Name of test (char, word, etc.) for use in error messages
4192// seed - Seed for starting random number generator (parameter from user)
4193// numIterations
4194//
46f4442e 4195void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
73c04bcf
A
4196 int32_t numIterations, UBool useUText) {
4197
4198#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4199
4200 const int32_t TESTSTRINGLEN = 500;
4201 UnicodeString testText;
4202 int32_t numCharClasses;
4203 UVector *chClasses;
4204 int expected[TESTSTRINGLEN*2 + 1];
4205 int expectedCount = 0;
4206 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4207 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4208 char reverseBreaks[TESTSTRINGLEN*2+1];
4209 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4210 char followingBreaks[TESTSTRINGLEN*2+1];
4211 char precedingBreaks[TESTSTRINGLEN*2+1];
4212 int i;
4213 int loopCount = 0;
4214
4215 m_seed = seed;
4216
4217 numCharClasses = mk.charClasses()->size();
4218 chClasses = mk.charClasses();
4219
4220 // Check for errors that occured during the construction of the MonkeyKind object.
4221 // Can't report them where they occured because errln() is a method coming from intlTest,
4222 // and is not visible outside of RBBITest :-(
4223 if (U_FAILURE(mk.deferredStatus)) {
4224 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4225 return;
4226 }
4227
4228 // Verify that the character classes all have at least one member.
4229 for (i=0; i<numCharClasses; i++) {
4230 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4231 if (s == NULL || s->size() == 0) {
4232 errln("Character Class #%d is null or of zero size.", i);
4233 return;
4234 }
4235 }
4236
4237 while (loopCount < numIterations || numIterations == -1) {
4238 if (numIterations == -1 && loopCount % 10 == 0) {
4239 // If test is running in an infinite loop, display a periodic tic so
4240 // we can tell that it is making progress.
4241 fprintf(stderr, ".");
4242 }
4243 // Save current random number seed, so that we can recreate the random numbers
4244 // for this loop iteration in event of an error.
4245 seed = m_seed;
4246
4247 // Populate a test string with data.
4248 testText.truncate(0);
4249 for (i=0; i<TESTSTRINGLEN; i++) {
4250 int32_t aClassNum = m_rand() % numCharClasses;
4251 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4252 int32_t charIdx = m_rand() % classSet->size();
4253 UChar32 c = classSet->charAt(charIdx);
4254 if (c < 0) { // TODO: deal with sets containing strings.
4255 errln("c < 0");
4256 break;
4257 }
4258 testText.append(c);
4259 }
4260
4261 // Calculate the expected results for this test string.
4262 mk.setText(testText);
4263 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4264 expectedBreaks[0] = 1;
4265 int32_t breakPos = 0;
4266 expectedCount = 0;
4267 for (;;) {
4268 breakPos = mk.next(breakPos);
4269 if (breakPos == -1) {
4270 break;
4271 }
4272 if (breakPos > testText.length()) {
4273 errln("breakPos > testText.length()");
4274 }
4275 expectedBreaks[breakPos] = 1;
4276 U_ASSERT(expectedCount<testText.length());
4277 expected[expectedCount ++] = breakPos;
4278 }
4279
4280 // Find the break positions using forward iteration
4281 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4282 if (useUText) {
4283 UErrorCode status = U_ZERO_ERROR;
4284 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4285 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4286 bi->setText(testUText, status);
4287 TEST_ASSERT_SUCCESS(status);
4288 utext_close(testUText); // The break iterator does a shallow clone of the UText
4289 // This UText can be closed immediately, so long as the
4290 // testText string continues to exist.
4291 } else {
4292 bi->setText(testText);
4293 }
4294
4295 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4296 if (i < 0 || i > testText.length()) {
4297 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4298 break;
4299 }
4300 forwardBreaks[i] = 1;
4301 }
4302
4303 // Find the break positions using reverse iteration
4304 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4305 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4306 if (i < 0 || i > testText.length()) {
4307 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4308 break;
4309 }
4310 reverseBreaks[i] = 1;
4311 }
4312
4313 // Find the break positions using isBoundary() tests.
4314 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4315 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4316 for (i=0; i<=testText.length(); i++) {
4317 isBoundaryBreaks[i] = bi->isBoundary(i);
4318 }
4319
4320
4321 // Find the break positions using the following() function.
4322 // printf(".");
4323 memset(followingBreaks, 0, sizeof(followingBreaks));
4324 int32_t lastBreakPos = 0;
4325 followingBreaks[0] = 1;
4326 for (i=0; i<testText.length(); i++) {
4327 breakPos = bi->following(i);
4328 if (breakPos <= i ||
4329 breakPos < lastBreakPos ||
4330 breakPos > testText.length() ||
4331 breakPos > lastBreakPos && lastBreakPos > i ) {
4332 errln("%s break monkey test: "
4333 "Out of range value returned by BreakIterator::following().\n"
4334 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4335 name, seed, i, breakPos, lastBreakPos);
4336 break;
4337 }
4338 followingBreaks[breakPos] = 1;
4339 lastBreakPos = breakPos;
4340 }
4341
4342 // Find the break positions using the preceding() function.
46f4442e 4343 memset(precedingBreaks, 0, sizeof(precedingBreaks));
73c04bcf
A
4344 lastBreakPos = testText.length();
4345 precedingBreaks[testText.length()] = 1;
4346 for (i=testText.length(); i>0; i--) {
4347 breakPos = bi->preceding(i);
4348 if (breakPos >= i ||
4349 breakPos > lastBreakPos ||
4350 breakPos < 0 && testText.getChar32Start(i)>0 ||
4351 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
4352 errln("%s break monkey test: "
4353 "Out of range value returned by BreakIterator::preceding().\n"
4354 "index=%d; prev returned %d; lastBreak=%d" ,
4355 name, i, breakPos, lastBreakPos);
46f4442e
A
4356 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4357 precedingBreaks[i] = 2; // Forces an error.
4358 }
73c04bcf 4359 } else {
46f4442e
A
4360 if (breakPos >= 0) {
4361 precedingBreaks[breakPos] = 1;
4362 }
73c04bcf
A
4363 lastBreakPos = breakPos;
4364 }
4365 }
4366
4367 // Compare the expected and actual results.
4368 for (i=0; i<=testText.length(); i++) {
4369 const char *errorType = NULL;
4370 if (forwardBreaks[i] != expectedBreaks[i]) {
4371 errorType = "next()";
4372 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4373 errorType = "previous()";
4374 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4375 errorType = "isBoundary()";
4376 } else if (followingBreaks[i] != expectedBreaks[i]) {
4377 errorType = "following()";
4378 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4379 errorType = "preceding()";
4380 }
4381
4382
4383 if (errorType != NULL) {
4384 // Format a range of the test text that includes the failure as
4385 // a data item that can be included in the rbbi test data file.
4386
4387 // Start of the range is the last point where expected and actual results
4388 // both agreed that there was a break position.
4389 int startContext = i;
4390 int32_t count = 0;
4391 for (;;) {
4392 if (startContext==0) { break; }
4393 startContext --;
4394 if (expectedBreaks[startContext] != 0) {
4395 if (count == 2) break;
4396 count ++;
4397 }
4398 }
4399
4400 // End of range is two expected breaks past the start position.
4401 int endContext = i + 1;
4402 int ci;
4403 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4404 for (;;) {
4405 if (endContext >= testText.length()) {break;}
4406 if (expectedBreaks[endContext-1] != 0) {
4407 if (count == 0) break;
4408 count --;
4409 }
4410 endContext ++;
4411 }
4412 }
4413
4414 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4415 UnicodeString errorText = "<data>";
4416 /***if (strcmp(errorType, "next()") == 0) {
4417 startContext = 0;
4418 endContext = testText.length();
4419
4420 printStringBreaks(testText, expected, expectedCount);
4421 }***/
4422
4423 for (ci=startContext; ci<endContext;) {
4424 UnicodeString hexChars("0123456789abcdef");
4425 UChar32 c;
4426 int bn;
4427 c = testText.char32At(ci);
4428 if (ci == i) {
4429 // This is the location of the error.
4430 errorText.append("<?>");
4431 } else if (expectedBreaks[ci] != 0) {
4432 // This a non-error expected break position.
4433 errorText.append("\\");
4434 }
4435 if (c < 0x10000) {
4436 errorText.append("\\u");
4437 for (bn=12; bn>=0; bn-=4) {
4438 errorText.append(hexChars.charAt((c>>bn)&0xf));
4439 }
4440 } else {
4441 errorText.append("\\U");
4442 for (bn=28; bn>=0; bn-=4) {
4443 errorText.append(hexChars.charAt((c>>bn)&0xf));
4444 }
4445 }
4446 ci = testText.moveIndex32(ci, 1);
4447 }
4448 errorText.append("\\");
4449 errorText.append("</data>\n");
4450
4451 // Output the error
4452 char charErrorTxt[500];
4453 UErrorCode status = U_ZERO_ERROR;
4454 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4455 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4456 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4457 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4458 errorType, seed, i, charErrorTxt);
4459 break;
4460 }
4461 }
4462
4463 loopCount++;
4464 }
4465#endif
4466}
4467
4468//
4469// TestDebug - A place-holder test for debugging purposes.
4470// For putting in fragments of other tests that can be invoked
4471// for tracing without a lot of unwanted extra stuff happening.
4472//
4473void RBBITest::TestDebug(void) {
4474#if 0
4475 UErrorCode status = U_ZERO_ERROR;
4476 int pos = 0;
4477 int ruleStatus = 0;
4478
4479 RuleBasedBreakIterator* bi =
4480 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4481 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4482 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4483 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4484 // UnicodeString s("Aaa. Bcd");
4485 s = s.unescape();
4486 bi->setText(s);
4487 UBool r = bi->isBoundary(8);
4488 printf("%s", r?"true":"false");
4489 return;
4490 pos = bi->last();
4491 do {
4492 // ruleStatus = bi->getRuleStatus();
4493 printf("%d\t%d\n", pos, ruleStatus);
4494 pos = bi->previous();
4495 } while (pos != BreakIterator::DONE);
4496#endif
4497}
4498
4499#endif /* #if !UCONFIG_NO_BREAK_ITERATION */