]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
73c04bcf
A
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2006, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7* Date Name Description
8* 12/15/99 Madhu Creation.
9* 01/12/2000 Madhu Updated for changed API and added new tests
10************************************************************************/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/utypes.h"
17#include "unicode/brkiter.h"
18#include "unicode/rbbi.h"
19#include "unicode/uchar.h"
20#include "unicode/utf16.h"
21#include "unicode/ucnv.h"
22#include "unicode/schriter.h"
23#include "unicode/uniset.h"
24#include "unicode/regex.h" // TODO: make conditional on regexp being built.
25#include "unicode/ustring.h"
26#include "unicode/utext.h"
27#include "intltest.h"
28#include "rbbitst.h"
29#include <string.h>
30#include "uvector.h"
31#include "uvectr32.h"
32#include "triedict.h"
33#include <string.h>
34#include <stdio.h>
35#include <stdlib.h>
36
// TEST_ASSERT: when the condition is false, report a failure that names the
//              source file and line of the check.  Expands to a call to errln().
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS: when the UErrorCode indicates failure, report the source
//              file and line together with the symbolic name of the error code.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
42
43
44//---------------------------------------------------------------------------
45//
46// class BITestData Holds a set of Break iterator test data and results
47// Includes
48// - the string data to be broken
49// - a vector of the expected break positions.
50// - a vector of source line numbers for the data,
51// (to help see where errors occurred.)
52// - The expected break tag values.
53// - Vectors of actual break positions and tag values.
54// - Functions for comparing actual with expected and
55// reporting errors.
56//
57//----------------------------------------------------------------------------
58class BITestData {
59public:
60 UnicodeString fDataToBreak;
61 UVector fExpectedBreakPositions;
62 UVector fExpectedTags;
63 UVector fLineNum;
64 UVector fActualBreakPositions; // Test Results.
65 UVector fActualTags;
66
67 BITestData(UErrorCode &status);
68 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
69 void checkResults(const char *heading, RBBITest *test);
70 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
71 void clearResults();
72};
73
74//
75// Constructor.
76//
77BITestData::BITestData(UErrorCode &status)
78: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
79 fActualTags(status)
80{
81}
82
83//
84// addDataChunk. Add a section (non-breaking) piece of data to the test data.
85// The macro form collects the line number, which is helpful
86// when tracking down failures.
87//
88// A null data item is inserted at the start of each test's data
89// to put the starting zero into the data list. The position saved for
90// each non-null item is its ending position.
91//
92#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
93void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
94 if (U_FAILURE(status)) {return;}
95 if (data != NULL) {
96 fDataToBreak.append(CharsToUnicodeString(data));
97 }
98 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
99 fExpectedTags.addElement(tag, status);
100 fLineNum.addElement(lineNum, status);
101}
102
103
104//
105// checkResults. Compare the actual and expected break positions, report any differences.
106//
107void BITestData::checkResults(const char *heading, RBBITest *test) {
108 int32_t expectedIndex = 0;
109 int32_t actualIndex = 0;
110
111 for (;;) {
112 // If we've run through both the expected and actual results vectors, we're done.
113 // break out of the loop.
114 if (expectedIndex >= fExpectedBreakPositions.size() &&
115 actualIndex >= fActualBreakPositions.size()) {
116 break;
117 }
118
119
120 if (expectedIndex >= fExpectedBreakPositions.size()) {
121 err(heading, test, expectedIndex-1, actualIndex);
122 actualIndex++;
123 continue;
124 }
125
126 if (actualIndex >= fActualBreakPositions.size()) {
127 err(heading, test, expectedIndex, actualIndex-1);
128 expectedIndex++;
129 continue;
130 }
131
132 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
133 err(heading, test, expectedIndex, actualIndex);
134 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
135 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
136 actualIndex++;
137 } else {
138 expectedIndex++;
139 }
140 continue;
141 }
142
143 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
144 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
145 heading, fLineNum.elementAt(expectedIndex),
146 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
147 }
148
149 actualIndex++;
150 expectedIndex++;
151 }
152}
153
154//
155// err - An error was found. Report it, along with information about where the
156// incorrectly broken test data appeared in the source file.
157//
158void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
159{
160 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
161 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
162 int32_t o = 0;
163 int32_t line = fLineNum.elementAti(expectedIdx);
164 if (expectedIdx > 0) {
165 // The line numbers are off by one because a premature break occurs somewhere
166 // within the previous item, rather than at the start of the current (expected) item.
167 // We want to report the offset of the unexpected break from the start of
168 // this previous item.
169 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
170 }
171 if (actual < expected) {
172 test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
173 } else {
174 test->errln("%s Failed to find break at end of item from line %d", heading, line);
175 }
176}
177
178
179void BITestData::clearResults() {
180 fActualBreakPositions.removeAllElements();
181 fActualTags.removeAllElements();
182}
183
184
185//-----------------------------------------------------------------------------------
186//
187// Canned Test Characters
188//
189//-----------------------------------------------------------------------------------
190
191static const UChar cannedTestArray[] = {
192 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
193 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
194 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
195 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
196 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
197 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
198 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
199 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
200};
201
202static UnicodeString* cannedTestChars = 0;
203
// Devanagari test fragments, written as escaped text (the backslash-u
// sequences are literal characters here, to be unescaped later).
// half forms: consonant + virama (U+094D) + zero width joiner (U+200D)
#define halfNA     "\\u0928\\u094d\\u200d"
#define halfSA     "\\u0938\\u094d\\u200d"
#define halfCHA    "\\u091a\\u094d\\u200d"
#define halfKA     "\\u0915\\u094d\\u200d"
// dead form: consonant + virama (U+094D)
#define deadTA     "\\u0924\\u094d"
209
210//--------------------------------------------------------------------------------------
211//
212// RBBITest constructor and destructor
213//
214//--------------------------------------------------------------------------------------
215
216RBBITest::RBBITest() {
217 UnicodeString temp(cannedTestArray);
218 cannedTestChars = new UnicodeString();
219 *cannedTestChars += (UChar)0x0000;
220 *cannedTestChars += temp;
221}
222
223
224RBBITest::~RBBITest() {
225 delete cannedTestChars;
226}
227
228
// Tag values for expected break status in the test data.
// NOTE(review): judging by the names these stand for number, letter,
// hiragana-or-katakana and ideograph words - confirm against the rule sources.
static const int T_NUMBER   = 100;
static const int T_LETTER   = 200;
static const int T_H_OR_K   = 300;
static const int T_IDEO     = 400;
233
234
235
236
237
238
239//--------------------------------------------------------------------
240//Testing the BreakIterator for devanagari script
241//--------------------------------------------------------------------
242
// Devanagari dead forms: a consonant followed by the virama (U+094D).
// Written as escaped text; the backslash-u sequences are literal characters here.
#define deadRA     "\\u0930\\u094d"    /* dead form RA   = devanagari RA   + virama */
#define deadPHA    "\\u092b\\u094d"    /* dead form PHA  = devanagari PHA  + virama */
#define deadTTHA   "\\u0920\\u094d"    /* dead form TTHA = devanagari TTHA + virama */
#define deadPA     "\\u092a\\u094d"    /* dead form PA   = devanagari PA   + virama */
#define deadSA     "\\u0938\\u094d"    /* dead form SA   = devanagari SA   + virama */
#define visarga    "\\u0903"           /* devanagari visarga, looks like an English colon */
249
250
251
252
253
254
255//-----------------------------------------------------------------------------------
256//
257// Test for status {tag} return value from break rules.
258// TODO: a more thorough test.
259//
260//-----------------------------------------------------------------------------------
261void RBBITest::TestStatusReturn() {
262 UnicodeString rulesString1 = "$Letters = [:L:];\n"
263 "$Numbers = [:N:];\n"
264 "$Letters+{1};\n"
265 "$Numbers+{2};\n"
266 "Help\\ {4}/me\\!;\n"
267 "[^$Letters $Numbers];\n"
268 "!.*;\n";
269 UnicodeString testString1 = "abc123..abc Help me Help me!";
270 // 01234567890123456789012345678
271 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
272 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
273
274 UErrorCode status=U_ZERO_ERROR;
275 UParseError parseError;
276
277 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
278 if(U_FAILURE(status)) {
279 errln("FAIL : in construction");
280 } else {
281 int32_t pos;
282 int32_t i = 0;
283 bi->setText(testString1);
284 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
285 if (pos != bounds1[i]) {
286 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
287 break;
288 }
289
290 int tag = bi->getRuleStatus();
291 if (tag != brkStatus[i]) {
292 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
293 break;
294 }
295 i++;
296 }
297 }
298 delete bi;
299}
300
301
302static void printStringBreaks(UnicodeString ustr, int expected[],
303 int expectedcount)
304{
305 UErrorCode status = U_ZERO_ERROR;
306 char name[100];
307 printf("code alpha extend alphanum type word sent line name\n");
308 int j;
309 for (j = 0; j < ustr.length(); j ++) {
310 if (expectedcount > 0) {
311 int k;
312 for (k = 0; k < expectedcount; k ++) {
313 if (j == expected[k]) {
314 printf("------------------------------------------------ %d\n",
315 j);
316 }
317 }
318 }
319 UChar32 c = ustr.char32At(j);
320 if (c > 0xffff) {
321 j ++;
322 }
323 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
324 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
325 u_isUAlphabetic(c),
326 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
327 u_isalnum(c),
328 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
329 u_charType(c),
330 U_SHORT_PROPERTY_NAME),
331 u_getPropertyValueName(UCHAR_WORD_BREAK,
332 u_getIntPropertyValue(c,
333 UCHAR_WORD_BREAK),
334 U_SHORT_PROPERTY_NAME),
335 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
336 u_getIntPropertyValue(c,
337 UCHAR_SENTENCE_BREAK),
338 U_SHORT_PROPERTY_NAME),
339 u_getPropertyValueName(UCHAR_LINE_BREAK,
340 u_getIntPropertyValue(c,
341 UCHAR_LINE_BREAK),
342 U_SHORT_PROPERTY_NAME),
343 name);
344 }
345}
346
347void RBBITest::TestThaiLineBreak() {
348 UErrorCode status = U_ZERO_ERROR;
349 BITestData thaiLineSelection(status);
350
351 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
352 // represents elided letters at the end of a long word. It should be bound to
353 // the end of the word and not treated as an independent punctuation mark.
354
355
356 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
357 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
358 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
359 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
360 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
361// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
362// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
363 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
364 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
365 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
366 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
367 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
368 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
369 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
370 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
371
372 // the one time where the paiyannoi occurs somewhere other than at the end
373 // of a word is in the Thai abbrevation for "etc.", which both begins and
374 // ends with a paiyannoi
375 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
376 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
377 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
378
379 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
380 Locale("th"), status);
381 if (U_FAILURE(status))
382 {
383 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
384 return;
385 }
386
387 generalIteratorTest(*e, thaiLineSelection);
388 delete e;
389}
390
391
392
393void RBBITest::TestMixedThaiLineBreak()
394{
395 UErrorCode status = U_ZERO_ERROR;
396 BITestData thaiLineSelection(status);
397
398 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
399
400
401 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
402 // start
403
404 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
405 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
406 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
407 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
408 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
409 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
410 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
411 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
412 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
413 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
414 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
415 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
416 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
417 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
418 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
419 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
420
421 // @suwit - end of changes
422
423
424 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
425 if (U_FAILURE(status))
426 {
427 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
428 return;
429 }
430
431
432 generalIteratorTest(*e, thaiLineSelection);
433 delete e;
434}
435
436
437void RBBITest::TestMaiyamok()
438{
439 UErrorCode status = U_ZERO_ERROR;
440 BITestData thaiLineSelection(status);
441 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
442 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
443 // word". Instead of appearing as a word unto itself, however, it's kept together
444 // with the word before it
445 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
446 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
447 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
448 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
449 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
450 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
451 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
452 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
453 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
454
455 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
456 Locale("th"), status);
457
458 if (U_FAILURE(status))
459 {
460 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
461 return;
462 }
463 generalIteratorTest(*e, thaiLineSelection);
464 delete e;
465}
466
467
468
469void RBBITest::TestBug3818() {
470 UErrorCode status = U_ZERO_ERROR;
471
472 // Four Thai words...
473 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
474 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
475 UnicodeString thaiStr(thaiWordData);
476
477 RuleBasedBreakIterator* bi =
478 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
479 if (U_FAILURE(status) || bi == NULL) {
480 errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
481 return;
482 }
483 bi->setText(thaiStr);
484
485 int32_t startOfSecondWord = bi->following(1);
486 if (startOfSecondWord != 4) {
487 errln("Fail at file %s, line %d expected start of word at 4, got %d",
488 __FILE__, __LINE__, startOfSecondWord);
489 }
490 startOfSecondWord = bi->following(0);
491 if (startOfSecondWord != 4) {
492 errln("Fail at file %s, line %d expected start of word at 4, got %d",
493 __FILE__, __LINE__, startOfSecondWord);
494 }
495 delete bi;
496}
497
498
499void RBBITest::TestJapaneseWordBreak() {
500 UErrorCode status = U_ZERO_ERROR;
501 BITestData japaneseWordSelection(status);
502
503 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
504 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
505 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
506 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
507 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
508 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
509 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
510
511 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
512 Locale("ja"), status);
513 if (U_FAILURE(status))
514 {
515 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
516 return;
517 }
518
519 generalIteratorTest(*e, japaneseWordSelection);
520 delete e;
521}
522
523void RBBITest::TestTrieDict() {
524 UErrorCode status = U_ZERO_ERROR;
525
526 //
527 // Open and read the test data file.
528 //
529 const char *testDataDirectory = IntlTest::getSourceTestData(status);
530 char testFileName[1000];
531 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
532 errln("Can't open test data. Path too long.");
533 return;
534 }
535 strcpy(testFileName, testDataDirectory);
536 strcat(testFileName, "riwords.txt");
537
538 // Items needing deleting at the end
539 MutableTrieDictionary *mutableDict = NULL;
540 CompactTrieDictionary *compactDict = NULL;
541 UnicodeSet *breaks = NULL;
542 UChar *testFile = NULL;
543 StringEnumeration *enumer = NULL;
544 MutableTrieDictionary *mutable2 = NULL;
545 StringEnumeration *cloneEnum = NULL;
546 CompactTrieDictionary *compact2 = NULL;
547
548
549 const UnicodeString *originalWord = NULL;
550 const UnicodeString *cloneWord = NULL;
551 UChar *current;
552 UChar *word;
553 UChar uc;
554 int32_t wordLen;
555 int32_t wordCount;
556 int32_t testCount;
557
558 int len;
559 testFile = ReadAndConvertFile(testFileName, len, status);
560 if (U_FAILURE(status)) {
561 goto cleanup; /* something went wrong, error already output */
562 }
563
564 mutableDict = new MutableTrieDictionary(0x0E1C, status);
565 if (U_FAILURE(status)) {
566 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
567 goto cleanup;
568 }
569
570 breaks = new UnicodeSet;
571 breaks->add(0x000A); // Line Feed
572 breaks->add(0x000D); // Carriage Return
573 breaks->add(0x2028); // Line Separator
574 breaks->add(0x2029); // Paragraph Separator
575
576 // Now add each non-comment line of the file as a word.
577 current = testFile;
578 word = current;
579 uc = *current++;
580 wordLen = 0;
581 wordCount = 0;
582
583 while (uc) {
584 if (uc == 0x0023) { // #comment line, skip
585 while (uc && !breaks->contains(uc)) {
586 uc = *current++;
587 }
588 }
589 else while (uc && !breaks->contains(uc)) {
590 ++wordLen;
591 uc = *current++;
592 }
593 if (wordLen > 0) {
594 mutableDict->addWord(word, wordLen, status);
595 if (U_FAILURE(status)) {
596 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
597 goto cleanup;
598 }
599 wordCount += 1;
600 }
601
602 // Find beginning of next line
603 while (uc && breaks->contains(uc)) {
604 uc = *current++;
605 }
606 word = current-1;
607 wordLen = 0;
608 }
609
610 if (wordCount < 50) {
611 errln("Word count (%d) unreasonably small\n", wordCount);
612 goto cleanup;
613 }
614
615 enumer = mutableDict->openWords(status);
616 if (U_FAILURE(status)) {
617 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
618 goto cleanup;
619 }
620
621 testCount = 0;
622 if (wordCount != (testCount = enumer->count(status))) {
623 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
624 testCount, wordCount, u_errorName(status));
625 goto cleanup;
626 }
627
628 delete enumer;
629 enumer = NULL;
630
631 // Now compact it
632 compactDict = new CompactTrieDictionary(*mutableDict, status);
633 if (U_FAILURE(status)) {
634 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
635 goto cleanup;
636 }
637
638 enumer = compactDict->openWords(status);
639 if (U_FAILURE(status)) {
640 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
641 goto cleanup;
642 }
643
644 if (wordCount != (testCount = enumer->count(status))) {
645 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
646 testCount, wordCount, u_errorName(status));
647 goto cleanup;
648 }
649
650 delete enumer;
651 enumer = NULL;
652
653 // Now un-compact it
654 mutable2 = compactDict->cloneMutable(status);
655 if (U_FAILURE(status)) {
656 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
657 goto cleanup;
658 }
659
660 cloneEnum = mutable2->openWords(status);
661 if (U_FAILURE(status)) {
662 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
663 goto cleanup;
664 }
665
666 if (wordCount != (testCount = cloneEnum->count(status))) {
667 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
668 testCount, wordCount, u_errorName(status));
669 goto cleanup;
670 }
671
672 // Compact original dictionary to clone. Note that we can only compare the same kind of
673 // dictionary as the order of the enumerators is not guaranteed to be the same between
674 // different kinds
675 enumer = mutableDict->openWords(status);
676 if (U_FAILURE(status)) {
677 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
678 goto cleanup;
679 }
680
681 originalWord = enumer->snext(status);
682 cloneWord = cloneEnum->snext(status);
683 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
684 if (*originalWord != *cloneWord) {
685 errln("Original and cloned MutableTrieDictionary word mismatch\n");
686 goto cleanup;
687 }
688 originalWord = enumer->snext(status);
689 cloneWord = cloneEnum->snext(status);
690 }
691
692 if (U_FAILURE(status)) {
693 errln("Enumeration failed: %s\n", u_errorName(status));
694 goto cleanup;
695 }
696
697 if (originalWord != cloneWord) {
698 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
699 goto cleanup;
700 }
701
702 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
703 compact2 = new CompactTrieDictionary(compactDict->data(), status);
704 if (U_FAILURE(status)) {
705 errln("CompactTrieDictionary(const void *,...) failed\n");
706 goto cleanup;
707 }
708
709 if (compact2->dataSize() == 0) {
710 errln("CompactTrieDictionary->dataSize() == 0\n");
711 goto cleanup;
712 }
713
714 // Now count the words via the second dictionary
715 delete enumer;
716 enumer = compact2->openWords(status);
717 if (U_FAILURE(status)) {
718 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
719 goto cleanup;
720 }
721
722 if (wordCount != (testCount = enumer->count(status))) {
723 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
724 testCount, wordCount, u_errorName(status));
725 goto cleanup;
726 }
727
728cleanup:
729 delete compactDict;
730 delete mutableDict;
731 delete breaks;
732 delete[] testFile;
733 delete enumer;
734 delete mutable2;
735 delete cloneEnum;
736 delete compact2;
737}
738
739//---------------------------------------------
740// runIndexedTest
741//---------------------------------------------
742
743void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
744{
745 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
746
747 switch (index) {
748 case 0: name = "TestBug4153072";
749 if(exec) TestBug4153072(); break;
750 case 1: name = "TestJapaneseLineBreak";
751 if(exec) TestJapaneseLineBreak(); break;
752 case 2: name = "TestStatusReturn";
753 if(exec) TestStatusReturn(); break;
754
755 case 3: name = "TestLineBreakData";
756 if(exec) TestLineBreakData(); break;
757 case 4: name = "TestEmptyString";
758 if(exec) TestEmptyString(); break;
759
760 case 5: name = "TestGetAvailableLocales";
761 if(exec) TestGetAvailableLocales(); break;
762
763 case 6: name = "TestGetDisplayName";
764 if(exec) TestGetDisplayName(); break;
765
766 case 7: name = "TestEndBehaviour";
767 if(exec) TestEndBehaviour(); break;
768 case 8: name = "TestMixedThaiLineBreak";
769 if(exec) TestMixedThaiLineBreak(); break;
770 case 9: name = "TestThaiLineBreak";
771 if(exec) TestThaiLineBreak(); break;
772 case 10: name = "TestMaiyamok";
773 if(exec) TestMaiyamok(); break;
774 case 11: name = "TestWordBreaks";
775 if(exec) TestWordBreaks(); break;
776 case 12: name = "TestWordBoundary";
777 if(exec) TestWordBoundary(); break;
778 case 13: name = "TestLineBreaks";
779 if(exec) TestLineBreaks(); break;
780 case 14: name = "TestSentBreaks";
781 if(exec) TestSentBreaks(); break;
782 case 15: name = "TestExtended";
783 if(exec) TestExtended(); break;
784 case 16: name = "TestMonkey";
785 if(exec) {
786 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
787 TestMonkey(params);
788 #else
789 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
790 #endif
791 }
792 break;
793 case 17: name = "TestBug3818";
794 if(exec) TestBug3818(); break;
795 case 18: name = "TestJapaneseWordBreak";
796 if(exec) TestJapaneseWordBreak(); break;
797 case 19: name = "TestDebug";
798 if(exec) TestDebug(); break;
799 case 20: name = "TestTrieDict";
800 if(exec) TestTrieDict(); break;
801
802 default: name = ""; break; //needed to end loop
803 }
804}
805
806
807//----------------------------------------------------------------------------
808//
809// generalIteratorTest Given a break iterator and a set of test data,
810// Run the tests and report the results.
811//
812//----------------------------------------------------------------------------
813void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
814{
815
816 bi.setText(td.fDataToBreak);
817
818 testFirstAndNext(bi, td);
819
820 testLastAndPrevious(bi, td);
821
822 testFollowing(bi, td);
823 testPreceding(bi, td);
824 testIsBoundary(bi, td);
825 doMultipleSelectionTest(bi, td);
826}
827
828
829//
830// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
831// kind of loop.
832//
833void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
834{
835 UErrorCode status = U_ZERO_ERROR;
836 int32_t p;
837 int32_t lastP = -1;
838 int32_t tag;
839
840 logln("Test first and next");
841 bi.setText(td.fDataToBreak);
842 td.clearResults();
843
844 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
845 td.fActualBreakPositions.addElement(p, status); // Save result.
846 tag = bi.getRuleStatus();
847 td.fActualTags.addElement(tag, status);
848 if (p <= lastP) {
849 // If the iterator is not making forward progress, stop.
850 // No need to raise an error here, it'll be detected in the normal check of results.
851 break;
852 }
853 lastP = p;
854 }
855 td.checkResults("testFirstAndNext", this);
856}
857
858
859//
860// TestLastAndPrevious. Run the iterator backwards, starting with last().
861//
862void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
863{
864 UErrorCode status = U_ZERO_ERROR;
865 int32_t p;
866 int32_t lastP = 0x7ffffffe;
867 int32_t tag;
868
869 logln("Test first and next");
870 bi.setText(td.fDataToBreak);
871 td.clearResults();
872
873 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
874 // Save break position. Insert it at start of vector of results, shoving
875 // already-saved results further towards the end.
876 td.fActualBreakPositions.insertElementAt(p, 0, status);
877 // bi.previous(); // TODO: Why does this fix things up????
878 // bi.next();
879 tag = bi.getRuleStatus();
880 td.fActualTags.insertElementAt(tag, 0, status);
881 if (p >= lastP) {
882 // If the iterator is not making progress, stop.
883 // No need to raise an error here, it'll be detected in the normal check of results.
884 break;
885 }
886 lastP = p;
887 }
888 td.checkResults("testLastAndPrevious", this);
889}
890
891
892void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
893{
894 UErrorCode status = U_ZERO_ERROR;
895 int32_t p;
896 int32_t tag;
897 int32_t lastP = -2; // A value that will never be returned as a break position.
898 // cannot be -1; that is returned for DONE.
899 int i;
900
901 logln("testFollowing():");
902 bi.setText(td.fDataToBreak);
903 td.clearResults();
904
905 // Save the starting point, since we won't get that out of following.
906 p = bi.first();
907 td.fActualBreakPositions.addElement(p, status); // Save result.
908 tag = bi.getRuleStatus();
909 td.fActualTags.addElement(tag, status);
910
911 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
912 p = bi.following(i);
913 if (p != lastP) {
914 if (p == RuleBasedBreakIterator::DONE) {
915 break;
916 }
917 // We've reached a new break position. Save it.
918 td.fActualBreakPositions.addElement(p, status); // Save result.
919 tag = bi.getRuleStatus();
920 td.fActualTags.addElement(tag, status);
921 lastP = p;
922 }
923 }
924 // The loop normally exits by means of the break in the middle.
925 // Make sure that the index was at the correct position for the break iterator to have
926 // returned DONE.
927 if (i != td.fDataToBreak.length()) {
928 errln("testFollowing(): iterator returned DONE prematurely.");
929 }
930
931 // Full check of all results.
932 td.checkResults("testFollowing", this);
933}
934
935
936
937void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
938 UErrorCode status = U_ZERO_ERROR;
939 int32_t p;
940 int32_t tag;
941 int32_t lastP = 0x7ffffffe;
942 int i;
943
944 logln("testPreceding():");
945 bi.setText(td.fDataToBreak);
946 td.clearResults();
947
948 p = bi.last();
949 td.fActualBreakPositions.addElement(p, status);
950 tag = bi.getRuleStatus();
951 td.fActualTags.addElement(tag, status);
952
953 for (i = td.fDataToBreak.length(); i>=-1; i--) {
954 p = bi.preceding(i);
955 if (p != lastP) {
956 if (p == RuleBasedBreakIterator::DONE) {
957 break;
958 }
959 // We've reached a new break position. Save it.
960 td.fActualBreakPositions.insertElementAt(p, 0, status);
961 lastP = p;
962 tag = bi.getRuleStatus();
963 td.fActualTags.insertElementAt(tag, 0, status);
964 }
965 }
966 // The loop normally exits by means of the break in the middle.
967 // Make sure that the index was at the correct position for the break iterator to have
968 // returned DONE.
969 if (i != 0) {
970 errln("testPreceding(): iterator returned DONE prematurely.");
971 }
972
973 // Full check of all results.
974 td.checkResults("testPreceding", this);
975}
976
977
978
979void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
980 UErrorCode status = U_ZERO_ERROR;
981 int i;
982 int32_t tag;
983
984 logln("testIsBoundary():");
985 bi.setText(td.fDataToBreak);
986 td.clearResults();
987
988 for (i = 0; i <= td.fDataToBreak.length(); i++) {
989 if (bi.isBoundary(i)) {
990 td.fActualBreakPositions.addElement(i, status); // Save result.
991 tag = bi.getRuleStatus();
992 td.fActualTags.addElement(tag, status);
993 }
994 }
995 td.checkResults("testIsBoundary: ", this);
996}
997
998
999
1000void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1001{
1002 iterator.setText(td.fDataToBreak);
1003
1004 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1005 int32_t offset = iterator.first();
1006 int32_t testOffset;
1007 int32_t count = 0;
1008
1009 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1010
1011 if (*testIterator != iterator)
1012 errln("clone() or operator!= failed: two clones compared unequal");
1013
1014 do {
1015 testOffset = testIterator->first();
1016 testOffset = testIterator->next(count);
1017 if (offset != testOffset)
1018 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1019
1020 if (offset != RuleBasedBreakIterator::DONE) {
1021 count++;
1022 offset = iterator.next();
1023
1024 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1025 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1026 if (count > 10000 || offset == -1) {
1027 errln("operator== failed too many times. Stopping test.");
1028 if (offset == -1) {
1029 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1030 }
1031 return;
1032 }
1033 }
1034 }
1035 } while (offset != RuleBasedBreakIterator::DONE);
1036
1037 // now do it backwards...
1038 offset = iterator.last();
1039 count = 0;
1040
1041 do {
1042 testOffset = testIterator->last();
1043 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
1044 if (offset != testOffset)
1045 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1046
1047 if (offset != RuleBasedBreakIterator::DONE) {
1048 count--;
1049 offset = iterator.previous();
1050 }
1051 } while (offset != RuleBasedBreakIterator::DONE);
1052
1053 delete testIterator;
1054}
1055
1056
1057//---------------------------------------------
1058//
1059// other tests
1060//
1061//---------------------------------------------
1062void RBBITest::TestEmptyString()
1063{
1064 UnicodeString text = "";
1065 UErrorCode status = U_ZERO_ERROR;
1066
1067 BITestData x(status);
1068 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
1069 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1070 if (U_FAILURE(status))
1071 {
1072 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1073 return;
1074 }
1075 generalIteratorTest(*bi, x);
1076 delete bi;
1077}
1078
1079void RBBITest::TestGetAvailableLocales()
1080{
1081 int32_t locCount = 0;
1082 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1083
1084 if (locCount == 0)
1085 errln("getAvailableLocales() returned an empty list!");
1086 // Just make sure that it's returning good memory.
1087 int32_t i;
1088 for (i = 0; i < locCount; ++i) {
1089 logln(locList[i].getName());
1090 }
1091}
1092
1093//Testing the BreakIterator::getDisplayName() function
1094void RBBITest::TestGetDisplayName()
1095{
1096 UnicodeString result;
1097
1098 BreakIterator::getDisplayName(Locale::getUS(), result);
1099 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1100 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1101 + result);
1102
1103 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1104 if (result != "French (France)")
1105 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1106 + result);
1107}
1108/**
1109 * Test End Behaviour
1110 * @bug 4068137
1111 */
1112void RBBITest::TestEndBehaviour()
1113{
1114 UErrorCode status = U_ZERO_ERROR;
1115 UnicodeString testString("boo.");
1116 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1117 if (U_FAILURE(status))
1118 {
1119 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1120 return;
1121 }
1122 wb->setText(testString);
1123
1124 if (wb->first() != 0)
1125 errln("Didn't get break at beginning of string.");
1126 if (wb->next() != 3)
1127 errln("Didn't get break before period in \"boo.\"");
1128 if (wb->current() != 4 && wb->next() != 4)
1129 errln("Didn't get break at end of string.");
1130 delete wb;
1131}
1132/*
1133 * @bug 4153072
1134 */
1135void RBBITest::TestBug4153072() {
1136 UErrorCode status = U_ZERO_ERROR;
1137 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1138 if (U_FAILURE(status))
1139 {
1140 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1141 return;
1142 }
1143 UnicodeString str("...Hello, World!...");
1144 int32_t begin = 3;
1145 int32_t end = str.length() - 3;
1146 UBool onBoundary;
1147
1148 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1149 iter->adoptText(textIterator);
1150 int index;
1151 // Note: with the switch to UText, there is no way to restrict the
1152 // iteration range to begin at an index other than zero.
1153 // String character iterators created with a non-zero bound are
1154 // treated by RBBI as being empty.
1155 for (index = -1; index < begin + 1; ++index) {
1156 onBoundary = iter->isBoundary(index);
1157 if (index == 0? !onBoundary : onBoundary) {
1158 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1159 " and begin index = " + begin);
1160 }
1161 }
1162 delete iter;
1163}
1164
1165
1166/**
1167 * Test Japanese Line Break
1168 * @bug 4095322
1169 */
1170void RBBITest::TestJapaneseLineBreak()
1171{
1172#if 0
1173 // Test needs updating some more... Dump it for now.
1174
1175
1176 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1177 // as opening and closing punctuation for line breaking.
1178 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1179 // from these tests. 6-13-2002
1180 //
1181 UErrorCode status = U_ZERO_ERROR;
1182 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1183 UnicodeString precedingChars = CharsToUnicodeString(
1184 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1185 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1186 UnicodeString followingChars = CharsToUnicodeString(
1187 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1188 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1189 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1190 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1191 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1192 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1193
1194 int32_t i;
1195 if (U_FAILURE(status))
1196 {
1197 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1198 return;
1199 }
1200
1201 for (i = 0; i < precedingChars.length(); i++) {
1202 testString.setCharAt(1, precedingChars[i]);
1203 iter->setText(testString);
1204 int32_t j = iter->first();
1205 if (j != 0)
1206 errln("ja line break failure: failed to start at 0");
1207 j = iter->next();
1208 if (j != 1)
1209 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1210 + "' (" + ((int)(precedingChars[i])) + ")");
1211 j = iter->next();
1212 if (j != 3)
1213 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1214 + "' (" + ((int)(precedingChars[i])) + ")");
1215 }
1216
1217 for (i = 0; i < followingChars.length(); i++) {
1218 testString.setCharAt(1, followingChars[i]);
1219 iter->setText(testString);
1220 int j = iter->first();
1221 if (j != 0)
1222 errln("ja line break failure: failed to start at 0");
1223 j = iter->next();
1224 if (j != 2)
1225 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1226 + "' (" + ((int)(followingChars[i])) + ")");
1227 j = iter->next();
1228 if (j != 3)
1229 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1230 + "' (" + ((int)(followingChars[i])) + ")");
1231 }
1232 delete iter;
1233#endif
1234}
1235
1236
1237//------------------------------------------------------------------------------
1238//
1239// RBBITest::Extended Run RBBI Tests from an external test data file
1240//
1241//------------------------------------------------------------------------------
1242
1243struct TestParams {
1244 BreakIterator *bi;
1245 UnicodeString dataToBreak;
1246 UVector32 *expectedBreaks;
1247 UVector32 *srcLine;
1248 UVector32 *srcCol;
1249};
1250
1251void RBBITest::executeTest(TestParams *t) {
1252 int32_t bp;
1253 int32_t prevBP;
1254 int32_t i;
1255
1256 if (t->bi == NULL) {
1257 return;
1258 }
1259
1260 t->bi->setText(t->dataToBreak);
1261 //
1262 // Run the iterator forward
1263 //
1264 prevBP = -1;
1265 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1266 if (prevBP == bp) {
1267 // Fail for lack of forward progress.
1268 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1269 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1270 break;
1271 }
1272
1273 // Check that there were we didn't miss an expected break between the last one
1274 // and this one.
1275 for (i=prevBP+1; i<bp; i++) {
1276 if (t->expectedBreaks->elementAti(i) != 0) {
1277 int expected[] = {0, i};
1278 printStringBreaks(t->dataToBreak, expected, 2);
1279 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1280 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1281 }
1282 }
1283
1284 // Check that the break we did find was expected
1285 if (t->expectedBreaks->elementAti(bp) == 0) {
1286 int expected[] = {0, bp};
1287 printStringBreaks(t->dataToBreak, expected, 2);
1288 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1289 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1290 } else {
1291 // The break was expected.
1292 // Check that the {nnn} tag value is correct.
1293 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1294 if (expectedTagVal == -1) {
1295 expectedTagVal = 0;
1296 }
1297 int32_t line = t->srcLine->elementAti(bp);
1298 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1299 if (rs != expectedTagVal) {
1300 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1301 " Actual, Expected status = %4d, %4d",
1302 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1303 }
1304 }
1305
1306
1307 prevBP = bp;
1308 }
1309
1310 // Verify that there were no missed expected breaks after the last one found
1311 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1312 if (t->expectedBreaks->elementAti(i) != 0) {
1313 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1314 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1315 }
1316 }
1317
1318 //
1319 // Run the iterator backwards, verify that the same breaks are found.
1320 //
1321 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
1322 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1323 if (prevBP == bp) {
1324 // Fail for lack of progress.
1325 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1326 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1327 break;
1328 }
1329
1330 // Check that there were we didn't miss an expected break between the last one
1331 // and this one. (UVector returns zeros for index out of bounds.)
1332 for (i=prevBP-1; i>bp; i--) {
1333 if (t->expectedBreaks->elementAti(i) != 0) {
1334 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1335 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1336 }
1337 }
1338
1339 // Check that the break we did find was expected
1340 if (t->expectedBreaks->elementAti(bp) == 0) {
1341 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1342 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1343 } else {
1344 // The break was expected.
1345 // Check that the {nnn} tag value is correct.
1346 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1347 if (expectedTagVal == -1) {
1348 expectedTagVal = 0;
1349 }
1350 int line = t->srcLine->elementAti(bp);
1351 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1352 if (rs != expectedTagVal) {
1353 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1354 " Actual, Expected status = %4d, %4d",
1355 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1356 }
1357 }
1358
1359 prevBP = bp;
1360 }
1361
1362 // Verify that there were no missed breaks prior to the last one found
1363 for (i=prevBP-1; i>=0; i--) {
1364 if (t->expectedBreaks->elementAti(i) != 0) {
1365 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1366 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1367 }
1368 }
1369}
1370
1371
1372void RBBITest::TestExtended() {
1373#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1374 UErrorCode status = U_ZERO_ERROR;
1375 Locale locale("");
1376
1377 UnicodeString rules;
1378 TestParams tp;
1379 tp.bi = NULL;
1380 tp.expectedBreaks = new UVector32(status);
1381 tp.srcLine = new UVector32(status);
1382 tp.srcCol = new UVector32(status);
1383
1384 RegexMatcher localeMatcher("<locale *([\\p{L}\\p{Nd}_]*) *>", 0, status);
1385 TEST_ASSERT_SUCCESS(status);
1386
1387
1388 //
1389 // Open and read the test data file.
1390 //
1391 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1392 char testFileName[1000];
1393 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1394 errln("Can't open test data. Path too long.");
1395 return;
1396 }
1397 strcpy(testFileName, testDataDirectory);
1398 strcat(testFileName, "rbbitst.txt");
1399
1400 int len;
1401 UChar *testFile = ReadAndConvertFile(testFileName, len, status);
1402 if (U_FAILURE(status)) {
1403 return; /* something went wrong, error already output */
1404 }
1405
1406
1407
1408 //
1409 // Put the test data into a UnicodeString
1410 //
1411 UnicodeString testString(FALSE, testFile, len);
1412
1413 enum EParseState{
1414 PARSE_COMMENT,
1415 PARSE_TAG,
1416 PARSE_DATA,
1417 PARSE_NUM
1418 }
1419 parseState = PARSE_TAG;
1420
1421 EParseState savedState = PARSE_TAG;
1422
1423 static const UChar CH_LF = 0x0a;
1424 static const UChar CH_CR = 0x0d;
1425 static const UChar CH_HASH = 0x23;
1426 /*static const UChar CH_PERIOD = 0x2e;*/
1427 static const UChar CH_LT = 0x3c;
1428 static const UChar CH_GT = 0x3e;
1429 static const UChar CH_BACKSLASH = 0x5c;
1430 static const UChar CH_BULLET = 0x2022;
1431
1432 int32_t lineNum = 1;
1433 int32_t colStart = 0;
1434 int32_t column = 0;
1435 int32_t charIdx = 0;
1436
1437 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1438
1439 for (charIdx = 0; charIdx < len; ) {
1440 status = U_ZERO_ERROR;
1441 UChar c = testString.charAt(charIdx);
1442 charIdx++;
1443 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1444 // treat CRLF as a unit
1445 c = CH_LF;
1446 charIdx++;
1447 }
1448 if (c == CH_LF || c == CH_CR) {
1449 lineNum++;
1450 colStart = charIdx;
1451 }
1452 column = charIdx - colStart + 1;
1453
1454 switch (parseState) {
1455 case PARSE_COMMENT:
1456 if (c == 0x0a || c == 0x0d) {
1457 parseState = savedState;
1458 }
1459 break;
1460
1461 case PARSE_TAG:
1462 {
1463 if (c == CH_HASH) {
1464 parseState = PARSE_COMMENT;
1465 savedState = PARSE_TAG;
1466 break;
1467 }
1468 if (u_isUWhiteSpace(c)) {
1469 break;
1470 }
1471 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1472 delete tp.bi;
1473 tp.bi = BreakIterator::createWordInstance(locale, status);
1474 charIdx += 5;
1475 break;
1476 }
1477 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1478 delete tp.bi;
1479 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1480 charIdx += 5;
1481 break;
1482 }
1483 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1484 delete tp.bi;
1485 tp.bi = BreakIterator::createLineInstance(locale, status);
1486 charIdx += 5;
1487 break;
1488 }
1489 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1490 delete tp.bi;
1491 tp.bi = NULL;
1492 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1493 charIdx += 5;
1494 break;
1495 }
1496 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1497 delete tp.bi;
1498 tp.bi = BreakIterator::createTitleInstance(locale, status);
1499 charIdx += 6;
1500 break;
1501 }
1502 // <locale loc_name>
1503 localeMatcher.reset(testString);
1504 if (localeMatcher.lookingAt(charIdx-1, status)) {
1505 UnicodeString localeName = localeMatcher.group(1, status);
1506 char localeName8[100];
1507 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1508 locale = Locale::createFromName(localeName8);
1509 charIdx += localeMatcher.group(0, status).length();
1510 TEST_ASSERT_SUCCESS(status);
1511 break;
1512 }
1513 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1514 parseState = PARSE_DATA;
1515 charIdx += 5;
1516 tp.dataToBreak = "";
1517 tp.expectedBreaks->removeAllElements();
1518 tp.srcCol ->removeAllElements();
1519 tp.srcLine->removeAllElements();
1520 break;
1521 }
1522
1523 errln("line %d: Tag expected in test file.", lineNum);
1524 goto end_test;
1525 parseState = PARSE_COMMENT;
1526 savedState = PARSE_DATA;
1527 }
1528 break;
1529
1530 case PARSE_DATA:
1531 if (c == CH_BULLET) {
1532 int32_t breakIdx = tp.dataToBreak.length();
1533 tp.expectedBreaks->setSize(breakIdx+1);
1534 tp.expectedBreaks->setElementAt(-1, breakIdx);
1535 tp.srcLine->setSize(breakIdx+1);
1536 tp.srcLine->setElementAt(lineNum, breakIdx);
1537 tp.srcCol ->setSize(breakIdx+1);
1538 tp.srcCol ->setElementAt(column, breakIdx);
1539 break;
1540 }
1541
1542 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1543 // Add final entry to mappings from break location to source file position.
1544 // Need one extra because last break position returned is after the
1545 // last char in the data, not at the last char.
1546 tp.srcLine->addElement(lineNum, status);
1547 tp.srcCol ->addElement(column, status);
1548
1549 parseState = PARSE_TAG;
1550 charIdx += 6;
1551
1552 // RUN THE TEST!
1553 executeTest(&tp);
1554 break;
1555 }
1556
1557 if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
1558 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1559 // Get the code point from the name and insert it into the test data.
1560 // (Damn, no API takes names in Unicode !!!
1561 // we've got to take it back to char *)
1562 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1563 int32_t nameLength = nameEndIdx - (charIdx+2);
1564 char charNameBuf[200];
1565 UChar32 theChar = -1;
1566 if (nameEndIdx != -1) {
1567 UErrorCode status = U_ZERO_ERROR;
1568 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1569 charNameBuf[sizeof(charNameBuf)-1] = 0;
1570 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1571 if (U_FAILURE(status)) {
1572 theChar = -1;
1573 }
1574 }
1575 if (theChar == -1) {
1576 errln("Error in named character in test file at line %d, col %d",
1577 lineNum, column);
1578 } else {
1579 // Named code point was recognized. Insert it
1580 // into the test data.
1581 tp.dataToBreak.append(theChar);
1582 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1583 tp.srcLine->addElement(lineNum, status);
1584 tp.srcCol ->addElement(column, status);
1585 }
1586 }
1587 if (nameEndIdx > charIdx) {
1588 charIdx = nameEndIdx+1;
1589
1590 }
1591 break;
1592 }
1593
1594
1595
1596
1597 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1598 charIdx++;
1599 int32_t breakIdx = tp.dataToBreak.length();
1600 tp.expectedBreaks->setSize(breakIdx+1);
1601 tp.expectedBreaks->setElementAt(-1, breakIdx);
1602 tp.srcLine->setSize(breakIdx+1);
1603 tp.srcLine->setElementAt(lineNum, breakIdx);
1604 tp.srcCol ->setSize(breakIdx+1);
1605 tp.srcCol ->setElementAt(column, breakIdx);
1606 break;
1607 }
1608
1609 if (c == CH_LT) {
1610 tagValue = 0;
1611 parseState = PARSE_NUM;
1612 break;
1613 }
1614
1615 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1616 parseState = PARSE_COMMENT;
1617 savedState = PARSE_DATA;
1618 break;
1619 }
1620
1621 if (c == CH_BACKSLASH) {
1622 // Check for \ at end of line, a line continuation.
1623 // Advance over (discard) the newline
1624 UChar32 cp = testString.char32At(charIdx);
1625 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1626 // We have a CR LF
1627 // Need an extra increment of the input ptr to move over both of them
1628 charIdx++;
1629 }
1630 if (cp == CH_LF || cp == CH_CR) {
1631 lineNum++;
1632 colStart = charIdx;
1633 charIdx++;
1634 break;
1635 }
1636
1637 // Let unescape handle the back slash.
1638 cp = testString.unescapeAt(charIdx);
1639 if (cp != -1) {
1640 // Escape sequence was recognized. Insert the char
1641 // into the test data.
1642 tp.dataToBreak.append(cp);
1643 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1644 tp.srcLine->addElement(lineNum, status);
1645 tp.srcCol ->addElement(column, status);
1646 }
1647 break;
1648 }
1649
1650
1651 // Not a recognized backslash escape sequence.
1652 // Take the next char as a literal.
1653 // TODO: Should this be an error?
1654 c = testString.charAt(charIdx);
1655 charIdx = testString.moveIndex32(charIdx, 1);
1656 }
1657
1658 // Normal, non-escaped data char.
1659 tp.dataToBreak.append(c);
1660
1661 // Save the mapping from offset in the data to line/column numbers in
1662 // the original input file. Will be used for better error messages only.
1663 // If there's an expected break before this char, the slot in the mapping
1664 // vector will already be set for this char; don't overwrite it.
1665 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1666 tp.srcLine->addElement(lineNum, status);
1667 tp.srcCol ->addElement(column, status);
1668 }
1669 break;
1670
1671
1672 case PARSE_NUM:
1673 // We are parsing an expected numeric tag value, like <1234>,
1674 // within a chunk of data.
1675 if (u_isUWhiteSpace(c)) {
1676 break;
1677 }
1678
1679 if (c == CH_GT) {
1680 // Finished the number. Add the info to the expected break data,
1681 // and switch parse state back to doing plain data.
1682 parseState = PARSE_DATA;
1683 if (tagValue == 0) {
1684 tagValue = -1;
1685 }
1686 int32_t breakIdx = tp.dataToBreak.length();
1687 tp.expectedBreaks->setSize(breakIdx+1);
1688 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1689 tp.srcLine->setSize(breakIdx+1);
1690 tp.srcLine->setElementAt(lineNum, breakIdx);
1691 tp.srcCol ->setSize(breakIdx+1);
1692 tp.srcCol ->setElementAt(column, breakIdx);
1693 break;
1694 }
1695
1696 if (u_isdigit(c)) {
1697 tagValue = tagValue*10 + u_charDigitValue(c);
1698 break;
1699 }
1700
1701 errln("Syntax Error in test file at line %d, col %d",
1702 lineNum, column);
1703 goto end_test;
1704 parseState = PARSE_COMMENT;
1705 break;
1706 }
1707
1708
1709 if (U_FAILURE(status)) {
1710 errln("ICU Error %s while parsing test file at line %d.",
1711 u_errorName(status), lineNum);
1712 goto end_test;
1713 status = U_ZERO_ERROR;
1714 }
1715
1716 }
1717
1718end_test:
1719 delete tp.bi;
1720 delete tp.expectedBreaks;
1721 delete tp.srcLine;
1722 delete tp.srcCol;
1723 delete [] testFile;
1724#endif
1725}
1726
1727
1728//-------------------------------------------------------------------------------
1729//
1730// ReadAndConvertFile Read a text data file, convert it to UChars, and
1731// return the datain one big UChar * buffer, which the caller must delete.
1732//
1733// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1734// Move this function to some common place.
1735//
1736//--------------------------------------------------------------------------------
1737UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
1738 UChar *retPtr = NULL;
1739 char *fileBuf = NULL;
1740 UConverter* conv = NULL;
1741 FILE *f = NULL;
1742
1743 ulen = 0;
1744 if (U_FAILURE(status)) {
1745 return retPtr;
1746 }
1747
1748 //
1749 // Open the file.
1750 //
1751 f = fopen(fileName, "rb");
1752 if (f == 0) {
1753 errln("Error opening test data file %s\n", fileName);
1754 status = U_FILE_ACCESS_ERROR;
1755 return NULL;
1756 }
1757 //
1758 // Read it in
1759 //
1760 int fileSize;
1761 int amt_read;
1762
1763 fseek( f, 0, SEEK_END);
1764 fileSize = ftell(f);
1765 fileBuf = new char[fileSize];
1766 fseek(f, 0, SEEK_SET);
1767 amt_read = fread(fileBuf, 1, fileSize, f);
1768 if (amt_read != fileSize || fileSize <= 0) {
1769 errln("Error reading test data file.");
1770 goto cleanUpAndReturn;
1771 }
1772
1773 //
1774 // Look for a Unicode Signature (BOM) on the data just read
1775 //
1776 int32_t signatureLength;
1777 const char * fileBufC;
1778 const char* encoding;
1779
1780 fileBufC = fileBuf;
1781 encoding = ucnv_detectUnicodeSignature(
1782 fileBuf, fileSize, &signatureLength, &status);
1783 if(encoding!=NULL ){
1784 fileBufC += signatureLength;
1785 fileSize -= signatureLength;
1786 }
1787
1788 //
1789 // Open a converter to take the rule file to UTF-16
1790 //
1791 conv = ucnv_open(encoding, &status);
1792 if (U_FAILURE(status)) {
1793 goto cleanUpAndReturn;
1794 }
1795
1796 //
1797 // Convert the rules to UChar.
1798 // Preflight first to determine required buffer size.
1799 //
1800 ulen = ucnv_toUChars(conv,
1801 NULL, // dest,
1802 0, // destCapacity,
1803 fileBufC,
1804 fileSize,
1805 &status);
1806 if (status == U_BUFFER_OVERFLOW_ERROR) {
1807 // Buffer Overflow is expected from the preflight operation.
1808 status = U_ZERO_ERROR;
1809
1810 retPtr = new UChar[ulen+1];
1811 ucnv_toUChars(conv,
1812 retPtr, // dest,
1813 ulen+1,
1814 fileBufC,
1815 fileSize,
1816 &status);
1817 }
1818
1819cleanUpAndReturn:
1820 fclose(f);
1821 delete []fileBuf;
1822 ucnv_close(conv);
1823 if (U_FAILURE(status)) {
1824 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1825 delete retPtr;
1826 retPtr = 0;
1827 ulen = 0;
1828 };
1829 return retPtr;
1830}
1831
1832
1833//--------------------------------------------------------------------------------------------
1834//
1835// Exhaustive Tests, using Unicode Data Files.
1836//
1837//--------------------------------------------------------------------------------------------
1838
1839//
1840// Token level scanner for the Unicode Line Break Test Data file.
1841// Return the next token, as follows:
1842// >= 0: a UChar32 character, scanned from hex in the file.
1843// -1: a break position, a division sign in the file.
1844// -2: end of rule. A new line in the file.
1845// -3: end of file. No more rules.
1846// -4: Error
1847//
1848// The scanner
1849// strips comments, ('#' to end of line)
1850// Recognizes CR, CR/LF and LF as new lines.
1851// Skips over spaces and Xs (don't break here) in the data.
1852//
1853struct ScanState {
1854 int32_t fPeekChar;
1855 UBool fPeeked;
1856 int32_t fLineNum;
1857 FILE *fFile;
1858 ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {};
1859};
1860
// Characters that the line break test data scanner treats specially.
//   They are spelled as hex values rather than character literals to keep
//   EBCDIC based machines happy; the data itself is latin-1 on all platforms.
static const int32_t    chSpace  = 0x20;    // space
static const int32_t    chTab    = 0x09;    // horizontal tab
static const int32_t    chCR     = 0x0D;    // carriage return
static const int32_t    chLF     = 0x0A;    // line feed
static const int32_t    chHash   = 0x23;    // '#'
static const int32_t    chMult   = 0xD7;    // latin-1 multiplication sign
static const int32_t    chDivide = 0xF7;    // latin-1 division sign
1870
1871static int32_t nextLBDToken(ScanState *s) {
1872 int32_t c;
1873
1874 // Read characters from the input file until we get something interesting
1875 // to return. The file is in latin-1 encoding.
1876 for (;;) {
1877 // Get the next character to look at,
1878 if (s->fPeeked) {
1879 c = s->fPeekChar;
1880 s->fPeeked = FALSE;
1881 } else {
1882 c = getc(s->fFile);
1883 }
1884
1885 // EOF. Return immediately.
1886 if (c == EOF) {
1887 return -3;
1888 }
1889
1890 // Spaces. Treat the multiply sign as a space - it indicates a no-break position
1891 // in the data, and the test program doesn't want to see them.
1892 // Continue the next char loop, looking for something significant.
1893 if (c == chSpace || c == chTab || c == chMult) {
1894 continue;
1895 }
1896
1897 // Divide sign. Indicates an expected break position.
1898 if (c == chDivide) {
1899 return -1;
1900 }
1901
1902 // New Line Handling. Keep track of line number in the file, which in turn
1903 // requires keeping track of CR/LF as a single new line.
1904 if (c == chCR) {
1905 s->fLineNum++;
1906 s->fPeekChar = getc(s->fFile);
1907 if (s->fPeekChar != chLF) {s->fPeeked = TRUE;};
1908 return -2;
1909 }
1910 if (c == chLF) {
1911 s->fLineNum++;
1912 return -2;
1913 }
1914
1915 // Comments. Consume everything up to the next new line.
1916 if (c == chHash) {
1917 do {
1918 c = getc(s->fFile);
1919 } while (!(c == EOF || c == chCR || c == chLF));
1920 s->fPeekChar = c;
1921 s->fPeeked = TRUE;
1922 return nextLBDToken(s);
1923 }
1924
1925 // Scan a hex character (UChar32) value.
1926 if (u_digit(c, 16) >= 0) {
1927 int32_t v = u_digit(c, 16);
1928 for (;;) {
1929 c = getc(s->fFile);
1930 if (u_digit(c, 16) < 0) {break;};
1931 v <<= 4;
1932 v += u_digit(c, 16);
1933 }
1934 s->fPeekChar = c;
1935 s->fPeeked = TRUE;
1936 return v;
1937 }
1938
1939 // Error. Character was something unexpected.
1940 return -4;
1941 }
1942}
1943
1944
1945
1946void RBBITest::TestLineBreakData() {
1947
1948 UErrorCode status = U_ZERO_ERROR;
1949 UnicodeString testString;
1950 UVector expectedBreaks(status);
1951 ScanState ss;
1952 int32_t tok;
1953
1954 BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
1955 if (U_FAILURE(status)) {
1956 errln("Failure creating break iterator");
1957 return;
1958 }
1959
1960 const char * lbdfName = "LBTest.txt";
1961
1962 // Open the test data file.
1963 // TODO: a proper way to handle this data.
1964 ss.fFile = fopen(lbdfName, "rb");
1965 if (ss.fFile == NULL) {
1966 logln("Unable to open Line Break Test Data file. Skipping test.");
1967 delete bi;
1968 return;
1969 }
1970
1971 // Loop once per line from the test data file.
1972 for (;;) {
1973 // Zero out test data from previous line.
1974 testString.truncate(0);
1975 expectedBreaks.removeAllElements();
1976
1977 // Read one test's (line's) worth of data from the file.
1978 // Loop once per token on the input file line.
1979 for(;;) {
1980 tok = nextLBDToken(&ss);
1981
1982 // If we scanned a character number in the file.
1983 // save it in the test data array.
1984 if (tok >= 0) {
1985 testString.append((UChar32)tok);
1986 continue;
1987 }
1988
1989 // If we scanned a break position in the data, record it.
1990 if (tok == -1) {
1991 expectedBreaks.addElement(testString.length(), status);
1992 continue;
1993 }
1994
1995 // If we scanned a new line, or EOF
1996 // drop out of scan loop and run the test case.
1997 if (tok == -2 || tok == -3) {break;};
1998
1999 // None of above. Error.
2000 errln("Failure: Unrecognized data format, test file line %d", ss.fLineNum);
2001 break;
2002 }
2003
2004 // If this line from the test data file actually contained test data,
2005 // run the test.
2006 if (testString.length() > 0) {
2007 int32_t pos; // Break Position in the test string
2008 int32_t expectedI = 0; // Index of expected break position in vector of same.
2009 int32_t expectedPos; // Expected break position (index into test string)
2010
2011 bi->setText(testString);
2012 pos = bi->first();
2013 pos = bi->next();
2014
2015 for (; pos != BreakIterator::DONE; ) {
2016 expectedPos = expectedBreaks.elementAti(expectedI);
2017 if (pos < expectedPos) {
2018 errln("Failure: Test file line %d, unexpected break found at position %d",
2019 ss.fLineNum, pos);
2020 break;
2021 }
2022 if (pos > expectedPos) {
2023 errln("Failure: Test file line %d, failed to find break at position %d",
2024 ss.fLineNum, expectedPos);
2025 break;
2026 }
2027 pos = bi->next();
2028 expectedI++;
2029 }
2030 }
2031
2032 // If we've hit EOF on the input file, we're done.
2033 if (tok == -3) {
2034 break;
2035 }
2036
2037 }
2038
2039 fclose(ss.fFile);
2040 delete bi;
2041
2042}
2043
2044#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2045
2046//---------------------------------------------------------------------------------------
2047//
2048// classs RBBIMonkeyKind
2049//
2050// Monkey Test for Break Iteration
2051// Abstract interface class. Concrete derived classes independently
2052// implement the break rules for different iterator types.
2053//
2054// The Monkey Test itself uses doesn't know which type of break iterator it is
2055// testing, but works purely in terms of the interface defined here.
2056//
2057//---------------------------------------------------------------------------------------
2058class RBBIMonkeyKind {
2059public:
2060 // Return a UVector of UnicodeSets, representing the character classes used
2061 // for this type of iterator.
2062 virtual UVector *charClasses() = 0;
2063
2064 // Set the test text on which subsequent calls to next() will operate
2065 virtual void setText(const UnicodeString &s) = 0;
2066
2067 // Find the next break postion, starting from the prev break position, or from zero.
2068 // Return -1 after reaching end of string.
2069 virtual int32_t next(int32_t i) = 0;
2070
2071 virtual ~RBBIMonkeyKind();
2072 UErrorCode deferredStatus;
2073
2074
2075protected:
2076 RBBIMonkeyKind();
2077
2078private:
2079};
2080
2081RBBIMonkeyKind::RBBIMonkeyKind() {
2082 deferredStatus = U_ZERO_ERROR;
2083}
2084
2085RBBIMonkeyKind::~RBBIMonkeyKind() {
2086}
2087
2088
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, to more easily
//                          reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Step the generator and return the next value, which is in the range 0 .. 32767.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Bits 16 through 30 of the new seed.
    return (m_seed >> 16) & 0x7FFF;
}
2104
2105
2106//------------------------------------------------------------------------------------------
2107//
2108// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2109// of RBBIMonkeyKind.
2110//
2111//------------------------------------------------------------------------------------------
2112class RBBICharMonkey: public RBBIMonkeyKind {
2113public:
2114 RBBICharMonkey();
2115 virtual ~RBBICharMonkey();
2116 virtual UVector *charClasses();
2117 virtual void setText(const UnicodeString &s);
2118 virtual int32_t next(int32_t i);
2119private:
2120 UVector *fSets;
2121
2122 UnicodeSet *fCRLFSet;
2123 UnicodeSet *fControlSet;
2124 UnicodeSet *fExtendSet;
2125 UnicodeSet *fHangulSet;
2126 UnicodeSet *fAnySet;
2127
2128 RegexMatcher *fMatcher;
2129 const UnicodeString *fText;
2130};
2131
2132
2133RBBICharMonkey::RBBICharMonkey() {
2134 UErrorCode status = U_ZERO_ERROR;
2135
2136 fText = NULL;
2137 fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grampheme cluster
2138
2139 fCRLFSet = new UnicodeSet("[\\r\\n]", status);
2140 fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
2141 fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
2142 fHangulSet = new UnicodeSet(
2143 "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
2144 "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
2145 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]", status);
2146
2147 fSets = new UVector(status);
2148 fSets->addElement(fCRLFSet, status);
2149 fSets->addElement(fControlSet, status);
2150 fSets->addElement(fExtendSet, status);
2151 fSets->addElement(fHangulSet, status);
2152 fSets->addElement(fAnySet, status);
2153 if (U_FAILURE(status)) {
2154 deferredStatus = status;
2155 }
2156}
2157
2158
2159void RBBICharMonkey::setText(const UnicodeString &s) {
2160 fText = &s;
2161 fMatcher->reset(s);
2162}
2163
2164
2165int32_t RBBICharMonkey::next(int32_t i) {
2166 UErrorCode status = U_ZERO_ERROR;
2167 int32_t retVal = -1;
2168
2169 if (fMatcher->find(i, status)) {
2170 retVal = fMatcher->end(status);
2171 }
2172 if (U_FAILURE(status)){
2173 retVal = -1;
2174 }
2175 return retVal;
2176}
2177
2178
2179UVector *RBBICharMonkey::charClasses() {
2180 return fSets;
2181}
2182
2183
2184RBBICharMonkey::~RBBICharMonkey() {
2185 delete fSets;
2186 delete fCRLFSet;
2187 delete fControlSet;
2188 delete fExtendSet;
2189 delete fHangulSet;
2190 delete fAnySet;
2191
2192 delete fMatcher;
2193}
2194
2195//------------------------------------------------------------------------------------------
2196//
2197// class RBBIWordMonkey Word Break specific implementation
2198// of RBBIMonkeyKind.
2199//
2200//------------------------------------------------------------------------------------------
2201class RBBIWordMonkey: public RBBIMonkeyKind {
2202public:
2203 RBBIWordMonkey();
2204 virtual ~RBBIWordMonkey();
2205 virtual UVector *charClasses();
2206 virtual void setText(const UnicodeString &s);
2207 virtual int32_t next(int32_t i);
2208private:
2209 UVector *fSets;
2210
2211 UnicodeSet *fKatakanaSet;
2212 UnicodeSet *fALetterSet;
2213 UnicodeSet *fMidLetterSet;
2214 UnicodeSet *fMidNumSet;
2215 UnicodeSet *fNumericSet;
2216 UnicodeSet *fFormatSet;
2217 UnicodeSet *fOtherSet;
2218 UnicodeSet *fExtendSet;
2219 UnicodeSet *fExtendNumLetSet;
2220
2221 RegexMatcher *fMatcher;
2222
2223 const UnicodeString *fText;
2224};
2225
2226
2227RBBIWordMonkey::RBBIWordMonkey()
2228{
2229 UErrorCode status = U_ZERO_ERROR;
2230
2231
2232 fSets = new UVector(status);
2233
2234 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2235 "[\\p{Line_Break = Complex_Context}"
2236 "-\\p{Grapheme_Cluster_Break = Extend}"
2237 "-\\p{Grapheme_Cluster_Break = Control}]]", status);
2238 //fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]", status);
2239 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]", status);
2240 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]", status);
2241 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]", status);
2242 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]", status);
2243 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]", status);
2244 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status);
2245 //fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]", status);
2246 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]", status);
2247
2248 fOtherSet = new UnicodeSet();
2249 if(U_FAILURE(status)) {
2250 deferredStatus = status;
2251 return;
2252 }
2253
2254 fOtherSet->complement();
2255 fOtherSet->removeAll(*fKatakanaSet);
2256 fOtherSet->removeAll(*fALetterSet);
2257 fOtherSet->removeAll(*fMidLetterSet);
2258 fOtherSet->removeAll(*fMidNumSet);
2259 fOtherSet->removeAll(*fNumericSet);
2260 fOtherSet->removeAll(*fExtendNumLetSet);
2261 fOtherSet->removeAll(*fFormatSet);
2262 fOtherSet->removeAll(*fExtendSet);
2263
2264 fSets->addElement(fALetterSet, status);
2265 fSets->addElement(fKatakanaSet, status);
2266 fSets->addElement(fMidLetterSet, status);
2267 fSets->addElement(fMidNumSet, status);
2268 fSets->addElement(fNumericSet, status);
2269 fSets->addElement(fFormatSet, status);
2270 fSets->addElement(fExtendSet, status);
2271 fSets->addElement(fOtherSet, status);
2272 fSets->addElement(fExtendNumLetSet, status);
2273
2274
2275 if (U_FAILURE(status)) {
2276 deferredStatus = status;
2277 }
2278}
2279
2280void RBBIWordMonkey::setText(const UnicodeString &s) {
2281 fText = &s;
2282}
2283
2284
2285int32_t RBBIWordMonkey::next(int32_t prevPos) {
2286 int p0, p1, p2, p3; // Indices of the significant code points around the
2287 // break position being tested. The candidate break
2288 // location is before p2.
2289
2290 int breakPos = -1;
2291
2292 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2293
2294 // Prev break at end of string. return DONE.
2295 if (prevPos >= fText->length()) {
2296 return -1;
2297 }
2298 p0 = p1 = p2 = p3 = prevPos;
2299 c3 = fText->char32At(prevPos);
2300 c0 = c1 = c2 = 0;
2301
2302 // Loop runs once per "significant" character position in the input text.
2303 for (;;) {
2304 // Move all of the positions forward in the input string.
2305 p0 = p1; c0 = c1;
2306 p1 = p2; c1 = c2;
2307 p2 = p3; c2 = c3;
2308
2309 // Advancd p3 by X(Extend | Format)* Rule 4
2310 do {
2311 p3 = fText->moveIndex32(p3, 1);
2312 c3 = fText->char32At(p3);
2313 }
2314 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2315
2316
2317 if (p1 == p2) {
2318 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2319 continue;
2320 }
2321 if (p2 == fText->length()) {
2322 // Reached end of string. Always a break position.
2323 break;
2324 }
2325
2326 // Rule (3) CR x LF
2327 // No Extend or Format characters may appear between the CR and LF,
2328 // which requires the additional check for p2 immediately following p1.
2329 //
2330 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2331 continue;
2332 }
2333
2334 // Rule (5). ALetter x ALetter
2335 if (fALetterSet->contains(c1) &&
2336 fALetterSet->contains(c2)) {
2337 continue;
2338 }
2339
2340 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2341 //
2342 // Also incorporates rule 7 by skipping pos ahead to position of the
2343 // terminating ALetter.
2344 if ( fALetterSet->contains(c1) &&
2345 fMidLetterSet->contains(c2) &&
2346 fALetterSet->contains(c3)) {
2347 continue;
2348 }
2349
2350
2351 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2352 if (fALetterSet->contains(c0) &&
2353 (fMidLetterSet->contains(c1) ) &&
2354 fALetterSet->contains(c2)) {
2355 continue;
2356 }
2357
2358 // Rule (8) Numeric x Numeric
2359 if (fNumericSet->contains(c1) &&
2360 fNumericSet->contains(c2)) {
2361 continue;
2362 }
2363
2364 // Rule (9) ALetter x Numeric
2365 if (fALetterSet->contains(c1) &&
2366 fNumericSet->contains(c2)) {
2367 continue;
2368 }
2369
2370 // Rule (10) Numeric x ALetter
2371 if (fNumericSet->contains(c1) &&
2372 fALetterSet->contains(c2)) {
2373 continue;
2374 }
2375
2376 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2377 if ( fNumericSet->contains(c0) &&
2378 fMidNumSet->contains(c1) &&
2379 fNumericSet->contains(c2)) {
2380 continue;
2381 }
2382
2383 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2384 if (fNumericSet->contains(c1) &&
2385 fMidNumSet->contains(c2) &&
2386 fNumericSet->contains(c3)) {
2387 continue;
2388 }
2389
2390 // Rule (13) Katakana x Katakana
2391 if (fKatakanaSet->contains(c1) &&
2392 fKatakanaSet->contains(c2)) {
2393 continue;
2394 }
2395
2396 // Rule 13a
2397 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2398 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2399 fExtendNumLetSet->contains(c2)) {
2400 continue;
2401 }
2402
2403 // Rule 13b
2404 if (fExtendNumLetSet->contains(c1) &&
2405 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2406 fKatakanaSet->contains(c2))) {
2407 continue;
2408 }
2409
2410 // Rule 14. Break found here.
2411 break;
2412 }
2413
2414 breakPos = p2;
2415 return breakPos;
2416}
2417
2418
2419UVector *RBBIWordMonkey::charClasses() {
2420 return fSets;
2421}
2422
2423
2424RBBIWordMonkey::~RBBIWordMonkey() {
2425 delete fSets;
2426 delete fKatakanaSet;
2427 delete fALetterSet;
2428 delete fMidLetterSet;
2429 delete fMidNumSet;
2430 delete fNumericSet;
2431 delete fFormatSet;
2432 delete fExtendSet;
2433 delete fExtendNumLetSet;
2434 delete fOtherSet;
2435}
2436
2437
2438
2439
2440//------------------------------------------------------------------------------------------
2441//
2442// class RBBISentMonkey Sentence Break specific implementation
2443// of RBBIMonkeyKind.
2444//
2445//------------------------------------------------------------------------------------------
2446class RBBISentMonkey: public RBBIMonkeyKind {
2447public:
2448 RBBISentMonkey();
2449 virtual ~RBBISentMonkey();
2450 virtual UVector *charClasses();
2451 virtual void setText(const UnicodeString &s);
2452 virtual int32_t next(int32_t i);
2453private:
2454 int moveBack(int posFrom);
2455 int moveForward(int posFrom);
2456 UChar32 cAt(int pos);
2457
2458 UVector *fSets;
2459
2460 UnicodeSet *fSepSet;
2461 UnicodeSet *fFormatSet;
2462 UnicodeSet *fSpSet;
2463 UnicodeSet *fLowerSet;
2464 UnicodeSet *fUpperSet;
2465 UnicodeSet *fOLetterSet;
2466 UnicodeSet *fNumericSet;
2467 UnicodeSet *fATermSet;
2468 UnicodeSet *fSTermSet;
2469 UnicodeSet *fCloseSet;
2470 UnicodeSet *fOtherSet;
2471 UnicodeSet *fExtendSet;
2472
2473 const UnicodeString *fText;
2474
2475};
2476
2477RBBISentMonkey::RBBISentMonkey()
2478{
2479 UErrorCode status = U_ZERO_ERROR;
2480
2481 fSets = new UVector(status);
2482
2483 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep}]", status);
2484 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]", status);
2485 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]", status);
2486 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]", status);
2487 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]", status);
2488 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]", status);
2489 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]", status);
2490 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]", status);
2491 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]", status);
2492 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]", status);
2493 fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}\\uff9e\\uff9f]", status);
2494 fOtherSet = new UnicodeSet();
2495
2496 if(U_FAILURE(status)) {
2497 deferredStatus = status;
2498 return;
2499 }
2500
2501 fOtherSet->complement();
2502 fOtherSet->removeAll(*fSepSet);
2503 fOtherSet->removeAll(*fFormatSet);
2504 fOtherSet->removeAll(*fSpSet);
2505 fOtherSet->removeAll(*fLowerSet);
2506 fOtherSet->removeAll(*fUpperSet);
2507 fOtherSet->removeAll(*fOLetterSet);
2508 fOtherSet->removeAll(*fNumericSet);
2509 fOtherSet->removeAll(*fATermSet);
2510 fOtherSet->removeAll(*fSTermSet);
2511 fOtherSet->removeAll(*fCloseSet);
2512 fOtherSet->removeAll(*fExtendSet);
2513
2514 fSets->addElement(fSepSet, status);
2515 fSets->addElement(fFormatSet, status);
2516
2517 fSets->addElement(fSpSet, status);
2518 fSets->addElement(fLowerSet, status);
2519 fSets->addElement(fUpperSet, status);
2520 fSets->addElement(fOLetterSet, status);
2521 fSets->addElement(fNumericSet, status);
2522 fSets->addElement(fATermSet, status);
2523 fSets->addElement(fSTermSet, status);
2524 fSets->addElement(fCloseSet, status);
2525 fSets->addElement(fOtherSet, status);
2526 fSets->addElement(fExtendSet, status);
2527
2528 if (U_FAILURE(status)) {
2529 deferredStatus = status;
2530 }
2531}
2532
2533
2534
2535void RBBISentMonkey::setText(const UnicodeString &s) {
2536 fText = &s;
2537}
2538
2539UVector *RBBISentMonkey::charClasses() {
2540 return fSets;
2541}
2542
2543
2544// moveBack() Find the "significant" code point preceding the index i.
2545// Skips over ($Extend | $Format)* .
2546//
2547int RBBISentMonkey::moveBack(int i) {
2548 if (i <= 0) {
2549 return -1;
2550 }
2551 UChar32 c;
2552 int32_t j = i;
2553 do {
2554 j = fText->moveIndex32(j, -1);
2555 c = fText->char32At(j);
2556 }
2557 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2558 return j;
2559
2560 }
2561
2562
2563int RBBISentMonkey::moveForward(int i) {
2564 if (i>=fText->length()) {
2565 return fText->length();
2566 }
2567 UChar32 c;
2568 int32_t j = i;
2569 do {
2570 j = fText->moveIndex32(j, 1);
2571 c = cAt(j);
2572 }
2573 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2574 return j;
2575}
2576
2577UChar32 RBBISentMonkey::cAt(int pos) {
2578 if (pos<0 || pos>=fText->length()) {
2579 return -1;
2580 } else {
2581 return fText->char32At(pos);
2582 }
2583}
2584
2585int32_t RBBISentMonkey::next(int32_t prevPos) {
2586 int p0, p1, p2, p3; // Indices of the significant code points around the
2587 // break position being tested. The candidate break
2588 // location is before p2.
2589
2590 int breakPos = -1;
2591
2592 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2593 UChar32 c;
2594
2595 // Prev break at end of string. return DONE.
2596 if (prevPos >= fText->length()) {
2597 return -1;
2598 }
2599 p0 = p1 = p2 = p3 = prevPos;
2600 c3 = fText->char32At(prevPos);
2601 c0 = c1 = c2 = 0;
2602
2603 // Loop runs once per "significant" character position in the input text.
2604 for (;;) {
2605 // Move all of the positions forward in the input string.
2606 p0 = p1; c0 = c1;
2607 p1 = p2; c1 = c2;
2608 p2 = p3; c2 = c3;
2609
2610 // Advancd p3 by X(Extend | Format)* Rule 4
2611 p3 = moveForward(p3);
2612 c3 = cAt(p3);
2613
2614 // Rule (3) CR x LF
2615 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2616 continue;
2617 }
2618
2619 // Rule (4). Sep <break>
2620 if (fSepSet->contains(c1)) {
2621 p2 = p1+1; // Separators don't combine with Extend or Format.
2622 break;
2623 }
2624
2625 if (p2 >= fText->length()) {
2626 // Reached end of string. Always a break position.
2627 break;
2628 }
2629
2630 if (p2 == prevPos) {
2631 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2632 continue;
2633 }
2634
2635 // Rule (6). ATerm x Numeric
2636 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2637 continue;
2638 }
2639
2640 // Rule (7). Upper ATerm x Uppper
2641 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2642 continue;
2643 }
2644
2645 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2646 // Note: STerm | ATerm are added to the negated part of the expression by a
2647 // note to the Unicode 5.0 documents.
2648 int p8 = p1;
2649 while (fSpSet->contains(cAt(p8))) {
2650 p8 = moveBack(p8);
2651 }
2652 while (fCloseSet->contains(cAt(p8))) {
2653 p8 = moveBack(p8);
2654 }
2655 if (fATermSet->contains(cAt(p8))) {
2656 p8=p2;
2657 for (;;) {
2658 c = cAt(p8);
2659 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2660 fLowerSet->contains(c) || fSepSet->contains(c) ||
2661 fATermSet->contains(c) || fSTermSet->contains(c)) {
2662 break;
2663 }
2664 p8 = moveForward(p8);
2665 }
2666 if (fLowerSet->contains(cAt(p8))) {
2667 continue;
2668 }
2669 }
2670
2671 // Rule 8a (STerm | ATerm) Close* Sp* x (STerm | ATerm);
2672 if (fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2673 p8 = p1;
2674 while (fSpSet->contains(cAt(p8))) {
2675 p8 = moveBack(p8);
2676 }
2677 while (fCloseSet->contains(cAt(p8))) {
2678 p8 = moveBack(p8);
2679 }
2680 c = cAt(p8);
2681 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2682 continue;
2683 }
2684 }
2685
2686 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep)
2687 int p9 = p1;
2688 while (fCloseSet->contains(cAt(p9))) {
2689 p9 = moveBack(p9);
2690 }
2691 c = cAt(p9);
2692 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2693 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2694 continue;
2695 }
2696 }
2697
2698 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep)
2699 int p10 = p1;
2700 while (fSpSet->contains(cAt(p10))) {
2701 p10 = moveBack(p10);
2702 }
2703 while (fCloseSet->contains(cAt(p10))) {
2704 p10 = moveBack(p10);
2705 }
2706 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2707 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2708 continue;
2709 }
2710 }
2711
2712 // Rule (11) (STerm | ATerm) Close* Sp* <break>
2713 int p11 = p1;
2714 while (fSpSet->contains(cAt(p11))) {
2715 p11 = moveBack(p11);
2716 }
2717 while (fCloseSet->contains(cAt(p11))) {
2718 p11 = moveBack(p11);
2719 }
2720 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2721 break;
2722 }
2723
2724 // Rule (12) Any x Any
2725 continue;
2726 }
2727 breakPos = p2;
2728 return breakPos;
2729}
2730
2731RBBISentMonkey::~RBBISentMonkey() {
2732 delete fSets;
2733 delete fSepSet;
2734 delete fFormatSet;
2735 delete fSpSet;
2736 delete fLowerSet;
2737 delete fUpperSet;
2738 delete fOLetterSet;
2739 delete fNumericSet;
2740 delete fATermSet;
2741 delete fSTermSet;
2742 delete fCloseSet;
2743 delete fOtherSet;
2744 delete fExtendSet;
2745}
2746
2747
2748
2749//-------------------------------------------------------------------------------------------
2750//
2751// RBBILineMonkey
2752//
2753//-------------------------------------------------------------------------------------------
2754
2755class RBBILineMonkey: public RBBIMonkeyKind {
2756public:
2757 RBBILineMonkey();
2758 virtual ~RBBILineMonkey();
2759 virtual UVector *charClasses();
2760 virtual void setText(const UnicodeString &s);
2761 virtual int32_t next(int32_t i);
2762 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2763private:
2764 UVector *fSets;
2765
2766 UnicodeSet *fBK;
2767 UnicodeSet *fCR;
2768 UnicodeSet *fLF;
2769 UnicodeSet *fCM;
2770 UnicodeSet *fNL;
2771 UnicodeSet *fSG;
2772 UnicodeSet *fWJ;
2773 UnicodeSet *fZW;
2774 UnicodeSet *fGL;
2775 UnicodeSet *fCB;
2776 UnicodeSet *fSP;
2777 UnicodeSet *fB2;
2778 UnicodeSet *fBA;
2779 UnicodeSet *fBB;
2780 UnicodeSet *fHY;
2781 UnicodeSet *fH2;
2782 UnicodeSet *fH3;
2783 UnicodeSet *fCL;
2784 UnicodeSet *fEX;
2785 UnicodeSet *fIN;
2786 UnicodeSet *fJL;
2787 UnicodeSet *fJV;
2788 UnicodeSet *fJT;
2789 UnicodeSet *fNS;
2790 UnicodeSet *fOP;
2791 UnicodeSet *fQU;
2792 UnicodeSet *fIS;
2793 UnicodeSet *fNU;
2794 UnicodeSet *fPO;
2795 UnicodeSet *fPR;
2796 UnicodeSet *fSY;
2797 UnicodeSet *fAI;
2798 UnicodeSet *fAL;
2799 UnicodeSet *fID;
2800 UnicodeSet *fSA;
2801 UnicodeSet *fXX;
2802
2803 BreakIterator *fCharBI;
2804
2805 const UnicodeString *fText;
2806 int32_t *fOrigPositions;
2807
2808 RegexMatcher *fNumberMatcher;
2809 RegexMatcher *fLB11Matcher;
2810};
2811
2812
2813RBBILineMonkey::RBBILineMonkey()
2814{
2815 UErrorCode status = U_ZERO_ERROR;
2816
2817 fSets = new UVector(status);
2818
2819 fBK = new UnicodeSet("[\\p{Line_Break=BK}]", status);
2820 fCR = new UnicodeSet("[\\p{Line_break=CR}]", status);
2821 fLF = new UnicodeSet("[\\p{Line_break=LF}]", status);
2822 fCM = new UnicodeSet("[\\p{Line_break=CM}]", status);
2823 fNL = new UnicodeSet("[\\p{Line_break=NL}]", status);
2824 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]", status);
2825 fZW = new UnicodeSet("[\\p{Line_break=ZW}]", status);
2826 fGL = new UnicodeSet("[\\p{Line_break=GL}]", status);
2827 fCB = new UnicodeSet("[\\p{Line_break=CB}]", status);
2828 fSP = new UnicodeSet("[\\p{Line_break=SP}]", status);
2829 fB2 = new UnicodeSet("[\\p{Line_break=B2}]", status);
2830 fBA = new UnicodeSet("[\\p{Line_break=BA}]", status);
2831 fBB = new UnicodeSet("[\\p{Line_break=BB}]", status);
2832 fHY = new UnicodeSet("[\\p{Line_break=HY}]", status);
2833 fH2 = new UnicodeSet("[\\p{Line_break=H2}]", status);
2834 fH3 = new UnicodeSet("[\\p{Line_break=H3}]", status);
2835 fCL = new UnicodeSet("[\\p{Line_break=CL}]", status);
2836 fEX = new UnicodeSet("[\\p{Line_break=EX}]", status);
2837 fIN = new UnicodeSet("[\\p{Line_break=IN}]", status);
2838 fJL = new UnicodeSet("[\\p{Line_break=JL}]", status);
2839 fJV = new UnicodeSet("[\\p{Line_break=JV}]", status);
2840 fJT = new UnicodeSet("[\\p{Line_break=JT}]", status);
2841 fNS = new UnicodeSet("[\\p{Line_break=NS}]", status);
2842 fOP = new UnicodeSet("[\\p{Line_break=OP}]", status);
2843 fQU = new UnicodeSet("[\\p{Line_break=QU}]", status);
2844 fIS = new UnicodeSet("[\\p{Line_break=IS}]", status);
2845 fNU = new UnicodeSet("[\\p{Line_break=NU}]", status);
2846 fPO = new UnicodeSet("[\\p{Line_break=PO}]", status);
2847 fPR = new UnicodeSet("[\\p{Line_break=PR}]", status);
2848 fSY = new UnicodeSet("[\\p{Line_break=SY}]", status);
2849 fAI = new UnicodeSet("[\\p{Line_break=AI}]", status);
2850 fAL = new UnicodeSet("[\\p{Line_break=AL}]", status);
2851 fID = new UnicodeSet("[\\p{Line_break=ID}]", status);
2852 fSA = new UnicodeSet("[\\p{Line_break=SA}]", status);
2853 fSG = new UnicodeSet("[\\ud800-\\udfff]", status);
2854 fXX = new UnicodeSet("[\\p{Line_break=XX}]", status);
2855
2856 if (U_FAILURE(status)) {
2857 deferredStatus = status;
2858 fCharBI = NULL;
2859 fNumberMatcher = NULL;
2860 return;
2861 }
2862
2863 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2864 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2865 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
2866 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2867
2868 fSets->addElement(fBK, status);
2869 fSets->addElement(fCR, status);
2870 fSets->addElement(fLF, status);
2871 fSets->addElement(fCM, status);
2872 fSets->addElement(fNL, status);
2873 fSets->addElement(fWJ, status);
2874 fSets->addElement(fZW, status);
2875 fSets->addElement(fGL, status);
2876 fSets->addElement(fCB, status);
2877 fSets->addElement(fSP, status);
2878 fSets->addElement(fB2, status);
2879 fSets->addElement(fBA, status);
2880 fSets->addElement(fBB, status);
2881 fSets->addElement(fHY, status);
2882 fSets->addElement(fH2, status);
2883 fSets->addElement(fH3, status);
2884 fSets->addElement(fCL, status);
2885 fSets->addElement(fEX, status);
2886 fSets->addElement(fIN, status);
2887 fSets->addElement(fJL, status);
2888 fSets->addElement(fJT, status);
2889 fSets->addElement(fJV, status);
2890 fSets->addElement(fNS, status);
2891 fSets->addElement(fOP, status);
2892 fSets->addElement(fQU, status);
2893 fSets->addElement(fIS, status);
2894 fSets->addElement(fNU, status);
2895 fSets->addElement(fPO, status);
2896 fSets->addElement(fPR, status);
2897 fSets->addElement(fSY, status);
2898 fSets->addElement(fAI, status);
2899 fSets->addElement(fAL, status);
2900 fSets->addElement(fID, status);
2901 fSets->addElement(fWJ, status);
2902 fSets->addElement(fSA, status);
2903 fSets->addElement(fSG, status);
2904
2905 fNumberMatcher = new RegexMatcher(
2906 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2907 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2908 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2909 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2910 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
2911 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?",
2912 0, status);
2913
2914 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2915
2916 if (U_FAILURE(status)) {
2917 deferredStatus = status;
2918 }
2919}
2920
2921
2922void RBBILineMonkey::setText(const UnicodeString &s) {
2923 fText = &s;
2924 fCharBI->setText(s);
2925 fNumberMatcher->reset(s);
2926}
2927
2928//
2929// rule9Adjust
2930// Line Break TR rules 9 and 10 implementation.
2931// This deals with combining marks and other sequences that
2932// that must be treated as if they were something other than what they actually are.
2933//
2934// This is factored out into a separate function because it must be applied twice for
2935// each potential break, once to the chars before the position being checked, then
2936// again to the text following the possible break.
2937//
2938void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2939 if (pos == -1) {
2940 // Invalid initial position. Happens during the warmup iteration of the
2941 // main loop in next().
2942 return;
2943 }
2944
2945 int32_t nPos = *nextPos;
2946
2947 // LB 9 Keep combining sequences together.
2948 // advance over any CM class chars. Note that Line Break CM is different
2949 // from the normal Grapheme Extend property.
2950 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2951 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2952 for (;;) {
2953 *nextChar = fText->char32At(nPos);
2954 if (!fCM->contains(*nextChar)) {
2955 break;
2956 }
2957 nPos = fText->moveIndex32(nPos, 1);
2958 }
2959 }
2960
2961
2962 // LB 9 Treat X CM* as if it were x.
2963 // No explicit action required.
2964
2965 // LB 10 Treat any remaining combining mark as AL
2966 if (fCM->contains(*posChar)) {
2967 *posChar = 0x41; // thisChar = 'A';
2968 }
2969
2970 // Push the updated nextPos and nextChar back to our caller.
2971 // This only makes a difference if posChar got bigger by consuming a
2972 // combining sequence.
2973 *nextPos = nPos;
2974 *nextChar = fText->char32At(nPos);
2975}
2976
2977
2978
2979int32_t RBBILineMonkey::next(int32_t startPos) {
2980 UErrorCode status = U_ZERO_ERROR;
2981 int32_t pos; // Index of the char following a potential break position
2982 UChar32 thisChar; // Character at above position "pos"
2983
2984 int32_t prevPos; // Index of the char preceding a potential break position
2985 UChar32 prevChar; // Character at above position. Note that prevChar
2986 // and thisChar may not be adjacent because combining
2987 // characters between them will be ignored.
2988
2989 int32_t nextPos; // Index of the next character following pos.
2990 // Usually skips over combining marks.
2991 int32_t nextCPPos; // Index of the code point following "pos."
2992 // May point to a combining mark.
2993 int32_t tPos; // temp value.
2994 UChar32 c;
2995
2996 if (startPos >= fText->length()) {
2997 return -1;
2998 }
2999
3000
3001 // Initial values for loop. Loop will run the first time without finding breaks,
3002 // while the invalid values shift out and the "this" and
3003 // "prev" positions are filled in with good values.
3004 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
3005 thisChar = prevChar = 0;
3006 nextPos = nextCPPos = startPos;
3007
3008
3009 // Loop runs once per position in the test text, until a break position
3010 // is found.
3011 for (;;) {
3012 prevPos = pos;
3013 prevChar = thisChar;
3014
3015 pos = nextPos;
3016 thisChar = fText->char32At(pos);
3017
3018 nextCPPos = fText->moveIndex32(pos, 1);
3019 nextPos = nextCPPos;
3020
3021 // Rule LB2 - Break at end of text.
3022 if (pos >= fText->length()) {
3023 break;
3024 }
3025
3026 // Rule LB 9 - adjust for combining sequences.
3027 // We do this one out-of-order because the adjustment does not change anything
3028 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3029 // be applied.
3030 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3031 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3032 c = fText->char32At(nextPos);
3033 rule9Adjust(pos, &thisChar, &nextPos, &c);
3034
3035 // If the loop is still warming up - if we haven't shifted the initial
3036 // -1 positions out of prevPos yet - loop back to advance the
3037 // position in the input without any further looking for breaks.
3038 if (prevPos == -1) {
3039 continue;
3040 }
3041
3042 // LB 4 Always break after hard line breaks,
3043 if (fBK->contains(prevChar)) {
3044 break;
3045 }
3046
3047 // LB 5 Break after CR, LF, NL, but not inside CR LF
3048 if (prevChar == 0x0d && thisChar == 0x0a) {
3049 continue;
3050 }
3051 if (prevChar == 0x0d ||
3052 prevChar == 0x0a ||
3053 prevChar == 0x85) {
3054 break;
3055 }
3056
3057 // LB 6 Don't break before hard line breaks
3058 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3059 fBK->contains(thisChar)) {
3060 continue;
3061 }
3062
3063
3064 // LB 7 Don't break before spaces or zero-width space.
3065 if (fSP->contains(thisChar)) {
3066 continue;
3067 }
3068
3069 if (fZW->contains(thisChar)) {
3070 continue;
3071 }
3072
3073 // LB 8 Break after zero width space
3074 if (fZW->contains(prevChar)) {
3075 break;
3076 }
3077
3078 // LB 9, 10 Already done, at top of loop.
3079 //
3080
3081
3082 // LB 11 Do not break before or after WORD JOINER and related characters.
3083 // x WJ
3084 // WJ x
3085 //
3086 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3087 continue;
3088 }
3089
3090 // LB 12
3091 // (!SP) x GL
3092 // GL x
3093 if ((!fSP->contains(prevChar)) && fGL->contains(thisChar) ||
3094 fGL->contains(prevChar)) {
3095 continue;
3096 }
3097
3098
3099
3100 // LB 13 Don't break before closings.
3101 // NU x CL and NU x IS are not matched here so that they will
3102 // fall into LB 17 and the more general number regular expression.
3103 //
3104 if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
3105 fEX->contains(thisChar) ||
3106 !fNU->contains(prevChar) && fIS->contains(thisChar) ||
3107 !fNU->contains(prevChar) && fSY->contains(thisChar)) {
3108 continue;
3109 }
3110
3111 // LB 14 Don't break after OP SP*
3112 // Scan backwards, checking for this sequence.
3113 // The OP char could include combining marks, so we actually check for
3114 // OP CM* SP*
3115 // Another Twist: The Rule 67 fixes may have changed a SP CM
3116 // sequence into a ID char, so before scanning back through spaces,
3117 // verify that prevChar is indeed a space. The prevChar variable
3118 // may differ from fText[prevPos]
3119 tPos = prevPos;
3120 if (fSP->contains(prevChar)) {
3121 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3122 tPos=fText->moveIndex32(tPos, -1);
3123 }
3124 }
3125 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3126 tPos=fText->moveIndex32(tPos, -1);
3127 }
3128 if (fOP->contains(fText->char32At(tPos))) {
3129 continue;
3130 }
3131
3132
3133 // LB 15 QU SP* x OP
3134 if (fOP->contains(thisChar)) {
3135 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3136 int tPos = prevPos;
3137 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3138 tPos = fText->moveIndex32(tPos, -1);
3139 }
3140 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3141 tPos = fText->moveIndex32(tPos, -1);
3142 }
3143 if (fQU->contains(fText->char32At(tPos))) {
3144 continue;
3145 }
3146 }
3147
3148
3149
3150 // LB 16 CL SP* x NS
3151 // Scan backwards for SP* CM* CL
3152 if (fNS->contains(thisChar)) {
3153 int tPos = prevPos;
3154 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3155 tPos = fText->moveIndex32(tPos, -1);
3156 }
3157 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3158 tPos = fText->moveIndex32(tPos, -1);
3159 }
3160 if (fCL->contains(fText->char32At(tPos))) {
3161 continue;
3162 }
3163 }
3164
3165
3166 // LB 17 B2 SP* x B2
3167 if (fB2->contains(thisChar)) {
3168 // Scan backwards, checking for the B2 CM* SP* sequence.
3169 tPos = prevPos;
3170 if (fSP->contains(prevChar)) {
3171 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3172 tPos=fText->moveIndex32(tPos, -1);
3173 }
3174 }
3175 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3176 tPos=fText->moveIndex32(tPos, -1);
3177 }
3178 if (fB2->contains(fText->char32At(tPos))) {
3179 continue;
3180 }
3181 }
3182
3183
3184 // LB 18 break after space
3185 if (fSP->contains(prevChar)) {
3186 break;
3187 }
3188
3189 // LB 19
3190 // x QU
3191 // QU x
3192 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3193 continue;
3194 }
3195
3196 // LB 20 Break around a CB
3197 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3198 break;
3199 }
3200
3201 // LB 21
3202 if (fBA->contains(thisChar) ||
3203 fHY->contains(thisChar) ||
3204 fNS->contains(thisChar) ||
3205 fBB->contains(prevChar) ) {
3206 continue;
3207 }
3208
3209 // LB 22
3210 if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
3211 fID->contains(prevChar) && fIN->contains(thisChar) ||
3212 fIN->contains(prevChar) && fIN->contains(thisChar) ||
3213 fNU->contains(prevChar) && fIN->contains(thisChar) ) {
3214 continue;
3215 }
3216
3217
3218 // LB 23 ID x PO
3219 // AL x NU
3220 // NU x AL
3221 if (fID->contains(prevChar) && fPO->contains(thisChar) ||
3222 fAL->contains(prevChar) && fNU->contains(thisChar) ||
3223 fNU->contains(prevChar) && fAL->contains(thisChar) ) {
3224 continue;
3225 }
3226
3227 // LB 24 Do not break between prefix and letters or ideographs.
3228 // PR x ID
3229 // PR x AL
3230 // PO x AL
3231 if (fPR->contains(prevChar) && fID->contains(thisChar) ||
3232 fPR->contains(prevChar) && fAL->contains(thisChar) ||
3233 fPO->contains(prevChar) && fAL->contains(thisChar) ) {
3234 continue;
3235 }
3236
3237
3238
3239 // LB 25 Numbers
3240 if (fNumberMatcher->lookingAt(prevPos, status)) {
3241 if (U_FAILURE(status)) {
3242 break;
3243 }
3244 // Matched a number. But could have been just a single digit, which would
3245 // not represent a "no break here" between prevChar and thisChar
3246 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3247 if (numEndIdx > pos) {
3248 // Number match includes at least our two chars being checked
3249 if (numEndIdx > nextPos) {
3250 // Number match includes additional chars. Update pos and nextPos
3251 // so that next loop iteration will continue at the end of the number,
3252 // checking for breaks between last char in number & whatever follows.
3253 pos = nextPos = numEndIdx;
3254 do {
3255 pos = fText->moveIndex32(pos, -1);
3256 thisChar = fText->char32At(pos);
3257 } while (fCM->contains(thisChar));
3258 }
3259 continue;
3260 }
3261 }
3262
3263
3264 // LB 26 Do not break a Korean syllable.
3265 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3266 fJV->contains(thisChar) ||
3267 fH2->contains(thisChar) ||
3268 fH3->contains(thisChar))) {
3269 continue;
3270 }
3271
3272 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3273 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3274 continue;
3275 }
3276
3277 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3278 fJT->contains(thisChar)) {
3279 continue;
3280 }
3281
3282 // LB 27 Treat a Korean Syllable Block the same as ID.
3283 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3284 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3285 fIN->contains(thisChar)) {
3286 continue;
3287 }
3288 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3289 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3290 fPO->contains(thisChar)) {
3291 continue;
3292 }
3293 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3294 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3295 continue;
3296 }
3297
3298
3299
3300 // LB 28 Do not break between alphabetics (“at”).
3301 if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3302 continue;
3303 }
3304
3305 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3306 if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3307 continue;
3308 }
3309
3310 //LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation
3311 // (AL | NU) x OP
3312 // CL x (AL | NU)
3313 if ((fAL->contains(prevChar) || fNU->contains(prevChar)) &&
3314 fOP->contains(thisChar)) {
3315 continue;
3316 }
3317 if (fCL->contains(prevChar) &&
3318 (fAL->contains(thisChar) || fNU->contains(thisChar))) {
3319 continue;
3320 }
3321
3322
3323 // LB 31 Break everywhere else
3324 break;
3325
3326 }
3327
3328 return pos;
3329}
3330
3331
3332UVector *RBBILineMonkey::charClasses() {
3333 return fSets;
3334}
3335
3336
3337RBBILineMonkey::~RBBILineMonkey() {
3338 delete fSets;
3339
3340 delete fBK;
3341 delete fCR;
3342 delete fLF;
3343 delete fCM;
3344 delete fNL;
3345 delete fWJ;
3346 delete fZW;
3347 delete fGL;
3348 delete fCB;
3349 delete fSP;
3350 delete fB2;
3351 delete fBA;
3352 delete fBB;
3353 delete fHY;
3354 delete fH2;
3355 delete fH3;
3356 delete fCL;
3357 delete fEX;
3358 delete fIN;
3359 delete fJL;
3360 delete fJV;
3361 delete fJT;
3362 delete fNS;
3363 delete fOP;
3364 delete fQU;
3365 delete fIS;
3366 delete fNU;
3367 delete fPO;
3368 delete fPR;
3369 delete fSY;
3370 delete fAI;
3371 delete fAL;
3372 delete fID;
3373 delete fSA;
3374 delete fSG;
3375 delete fXX;
3376
3377 delete fCharBI;
3378 delete fNumberMatcher;
3379}
3380
3381
3382//-------------------------------------------------------------------------------------------
3383//
3384// TestMonkey
3385//
3386// params
3387// seed=nnnnn Random number starting seed.
3388// Setting the seed allows errors to be reproduced.
3389// loop=nnn Looping count. Controls running time.
3390// -1: run forever.
3391// 0 or greater: run length.
3392//
3393// type = char | word | line | sent | title
3394//
3395//-------------------------------------------------------------------------------------------
3396
3397static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3398 int32_t val = defaultVal;
3399 name.append(" *= *(-?\\d+)");
3400 UErrorCode status = U_ZERO_ERROR;
3401 RegexMatcher m(name, params, 0, status);
3402 if (m.find()) {
3403 // The param exists. Convert the string to an int.
3404 char valString[100];
3405 int32_t paramLength = m.end(1, status) - m.start(1, status);
3406 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3407 paramLength = (int32_t)(sizeof(valString)-2);
3408 }
3409 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3410 val = strtol(valString, NULL, 10);
3411
3412 // Delete this parameter from the params string.
3413 m.reset();
3414 params = m.replaceFirst("", status);
3415 }
3416 U_ASSERT(U_SUCCESS(status));
3417 return val;
3418}
3419#endif
3420
3421static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3422 BreakIterator *bi,
3423 int expected[],
3424 int expectedcount)
3425{
3426 int count = 0;
3427 int i = 0;
3428 int forward[50];
3429 bi->setText(ustr);
3430 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3431 forward[count] = i;
3432 if (count < expectedcount && expected[count] != i) {
3433 test->errln("break forward test failed: expected %d but got %d",
3434 expected[count], i);
3435 break;
3436 }
3437 count ++;
3438 }
3439 if (count != expectedcount) {
3440 printStringBreaks(ustr, expected, expectedcount);
3441 test->errln("break forward test failed: missed %d match",
3442 expectedcount - count);
3443 return;
3444 }
3445 // testing boundaries
3446 for (i = 1; i < expectedcount; i ++) {
3447 int j = expected[i - 1];
3448 if (!bi->isBoundary(j)) {
3449 printStringBreaks(ustr, expected, expectedcount);
3450 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3451 return;
3452 }
3453 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3454 if (bi->isBoundary(j)) {
3455 printStringBreaks(ustr, expected, expectedcount);
3456 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3457 return;
3458 }
3459 }
3460 }
3461
3462 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3463 count --;
3464 if (forward[count] != i) {
3465 test->errln("happy break test previous() failed: expected %d but got %d",
3466 forward[count], i);
3467 break;
3468 }
3469 }
3470 if (count != 0) {
3471 printStringBreaks(ustr, expected, expectedcount);
3472 test->errln("break test previous() failed: missed a match");
3473 return;
3474 }
3475
3476 // testing preceding
3477 for (i = 0; i < expectedcount - 1; i ++) {
3478 // int j = expected[i] + 1;
3479 int j = ustr.moveIndex32(expected[i], 1);
3480 for (; j <= expected[i + 1]; j ++) {
3481 if (bi->preceding(j) != expected[i]) {
3482 printStringBreaks(ustr, expected, expectedcount);
3483 test->errln("preceding(): Not expecting boundary at position %d", j);
3484 return;
3485 }
3486 }
3487 }
3488}
3489
3490void RBBITest::TestWordBreaks(void)
3491{
3492#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3493
3494 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3495 Locale locale("en");
3496 UErrorCode status = U_ZERO_ERROR;
3497 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3498 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3499 UChar str[300];
3500 static const char *strlist[] =
3501 {
3502 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3503 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3504 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
3505 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3506 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3507 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3508 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3509 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3510 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3511 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3512 "\\u2027\\U000e0067\\u0a47\\u00b7",
3513 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3514 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3515 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3516 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3517 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3518 "\\u0027\\u11af\\U000e0057\\u0602",
3519 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3520 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3521 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3522 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3523 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3524 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3525 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3526 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3527 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3528 "\\u58f4\\U000e0049\\u20e7\\u2027",
3529 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3530 "\\ua183\\u102d\\u0bec\\u003a",
3531 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3532 "\\u003a\\u0e57\\u0fad\\u002e",
3533 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3534 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3535 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3536 "\\u003a\\u0664\\u00b7\\u1fba",
3537 "\\u003b\\u0027\\u00b7\\u47a3",
3538 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3539 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3540 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3541 };
3542 int loop;
3543 if (U_FAILURE(status)) {
3544 errln("Creation of break iterator failed %s", u_errorName(status));
3545 return;
3546 }
3547 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3548 // printf("looping %d\n", loop);
3549 u_unescape(strlist[loop], str, 25);
3550 UnicodeString ustr(str);
3551 // RBBICharMonkey monkey;
3552 RBBIWordMonkey monkey;
3553
3554 int expected[50];
3555 int expectedcount = 0;
3556
3557 monkey.setText(ustr);
3558 int i;
3559 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3560 expected[expectedcount ++] = i;
3561 }
3562
3563 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3564 }
3565 delete bi;
3566#endif
3567}
3568
3569void RBBITest::TestWordBoundary(void)
3570{
3571 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3572 Locale locale("en");
3573 UErrorCode status = U_ZERO_ERROR;
3574 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3575 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3576 UChar str[50];
3577 static const char *strlist[] =
3578 {
3579 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3580 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3581 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3582 "\\u2027\\U000e0067\\u0a47\\u00b7",
3583 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3584 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3585 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3586 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3587 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3588 "\\u0027\\u11af\\U000e0057\\u0602",
3589 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3590 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3591 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3592 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3593 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3594 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3595 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3596 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3597 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3598 "\\u58f4\\U000e0049\\u20e7\\u2027",
3599 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3600 "\\ua183\\u102d\\u0bec\\u003a",
3601 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3602 "\\u003a\\u0e57\\u0fad\\u002e",
3603 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3604 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3605 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3606 "\\u003a\\u0664\\u00b7\\u1fba",
3607 "\\u003b\\u0027\\u00b7\\u47a3",
3608 };
3609 int loop;
3610 if (U_FAILURE(status)) {
3611 errln("Creation of break iterator failed %s", u_errorName(status));
3612 return;
3613 }
3614 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3615 // printf("looping %d\n", loop);
3616 u_unescape(strlist[loop], str, 20);
3617 UnicodeString ustr(str);
3618 int forward[50];
3619 int count = 0;
3620
3621 bi->setText(ustr);
3622 int prev = 0;
3623 int i;
3624 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3625 forward[count ++] = i;
3626 if (i > prev) {
3627 int j;
3628 for (j = prev + 1; j < i; j ++) {
3629 if (bi->isBoundary(j)) {
3630 printStringBreaks(ustr, forward, count);
3631 errln("happy boundary test failed: expected %d not a boundary",
3632 j);
3633 return;
3634 }
3635 }
3636 }
3637 if (!bi->isBoundary(i)) {
3638 printStringBreaks(ustr, forward, count);
3639 errln("happy boundary test failed: expected %d a boundary",
3640 i);
3641 return;
3642 }
3643 prev = i;
3644 }
3645 }
3646 delete bi;
3647}
3648
3649void RBBITest::TestLineBreaks(void)
3650{
3651#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3652 Locale locale("en");
3653 UErrorCode status = U_ZERO_ERROR;
3654 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3655 const int32_t STRSIZE = 50;
3656 UChar str[STRSIZE];
3657 static const char *strlist[] =
3658 {
3659 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3660 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3661 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3662 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3663 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3664 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3665 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3666 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3667 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3668 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3669 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3670 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3671 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3672 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3673 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3674 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3675 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3676 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3677 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3678 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3679 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3680 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3681 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3682 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3683 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3684 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3685 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3686 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3687 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3688 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3689 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3690 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3691 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3692 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3693 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3694 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3695 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3696 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3697 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3698 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3699 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3700 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3701 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3702 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3703 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3704 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3705 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3706 };
3707 int loop;
3708 TEST_ASSERT_SUCCESS(status);
3709 if (U_FAILURE(status)) {
3710 return;
3711 }
3712 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3713 // printf("looping %d\n", loop);
3714 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3715 if (t >= STRSIZE) {
3716 TEST_ASSERT(FALSE);
3717 continue;
3718 }
3719
3720
3721 UnicodeString ustr(str);
3722 RBBILineMonkey monkey;
3723 if (U_FAILURE(monkey.deferredStatus)) {
3724 continue;
3725 }
3726
3727 const int EXPECTEDSIZE = 50;
3728 int expected[EXPECTEDSIZE];
3729 int expectedcount = 0;
3730
3731 monkey.setText(ustr);
3732 int i;
3733 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3734 if (expectedcount >= EXPECTEDSIZE) {
3735 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3736 return;
3737 }
3738 expected[expectedcount ++] = i;
3739 }
3740
3741 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3742 }
3743 delete bi;
3744#endif
3745}
3746
3747void RBBITest::TestSentBreaks(void)
3748{
3749#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3750 Locale locale("en");
3751 UErrorCode status = U_ZERO_ERROR;
3752 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3753 UChar str[200];
3754 static const char *strlist[] =
3755 {
3756 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3757 "This\n",
3758 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3759 "\"Sentence ending with a quote.\" Bye.",
3760 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3761 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3762 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3763 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3764 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3765 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3766 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3767 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3768 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3769 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3770 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3771 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3772 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3773 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3774 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3775 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3776 };
3777 int loop;
3778 if (U_FAILURE(status)) {
3779 errln("Creation of break iterator failed %s", u_errorName(status));
3780 return;
3781 }
3782 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3783 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3784 UnicodeString ustr(str);
3785
3786 RBBISentMonkey monkey;
3787 if (U_FAILURE(monkey.deferredStatus)) {
3788 continue;
3789 }
3790
3791 const int EXPECTEDSIZE = 50;
3792 int expected[EXPECTEDSIZE];
3793 int expectedcount = 0;
3794
3795 monkey.setText(ustr);
3796 int i;
3797 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3798 if (expectedcount >= EXPECTEDSIZE) {
3799 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3800 return;
3801 }
3802 expected[expectedcount ++] = i;
3803 }
3804
3805 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3806 }
3807 delete bi;
3808#endif
3809}
3810
3811void RBBITest::TestMonkey(char *params) {
3812#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3813
3814 UErrorCode status = U_ZERO_ERROR;
3815 int32_t loopCount = 500;
3816 int32_t seed = 1;
3817 UnicodeString breakType = "all";
3818 Locale locale("en");
3819 UBool useUText = FALSE;
3820
3821 if (quick == FALSE) {
3822 loopCount = 10000;
3823 }
3824
3825 if (params) {
3826 UnicodeString p(params);
3827 loopCount = getIntParam("loop", p, loopCount);
3828 seed = getIntParam("seed", p, seed);
3829
3830 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3831 if (m.find()) {
3832 breakType = m.group(1, status);
3833 m.reset();
3834 p = m.replaceFirst("", status);
3835 }
3836
3837 RegexMatcher u(" *utext", p, 0, status);
3838 if (u.find()) {
3839 useUText = TRUE;
3840 u.reset();
3841 p = u.replaceFirst("", status);
3842 }
3843
3844
3845 // m.reset(p);
3846 if (RegexMatcher("\\S", p, 0, status).find()) {
3847 // Each option is stripped out of the option string as it is processed.
3848 // All options have been checked. The option string should have been completely emptied..
3849 char buf[100];
3850 p.extract(buf, sizeof(buf), NULL, status);
3851 buf[sizeof(buf)-1] = 0;
3852 errln("Unrecognized or extra parameter: %s\n", buf);
3853 return;
3854 }
3855
3856 }
3857
3858 if (breakType == "char" || breakType == "all") {
3859 RBBICharMonkey m;
3860 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3861 if (U_SUCCESS(status)) {
3862 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3863 if (breakType == "all" && useUText==FALSE) {
3864 // Also run a quick test with UText when "all" is specified
3865 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3866 }
3867 }
3868 else {
3869 errln("Creation of character break iterator failed %s", u_errorName(status));
3870 }
3871 delete bi;
3872 }
3873
3874 if (breakType == "word" || breakType == "all") {
3875 logln("Word Break Monkey Test");
3876 RBBIWordMonkey m;
3877 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3878 if (U_SUCCESS(status)) {
3879 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3880 }
3881 else {
3882 errln("Creation of word break iterator failed %s", u_errorName(status));
3883 }
3884 delete bi;
3885 }
3886
3887 if (breakType == "line" || breakType == "all") {
3888 logln("Line Break Monkey Test");
3889 RBBILineMonkey m;
3890 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3891 if (loopCount >= 10) {
3892 loopCount = loopCount / 5; // Line break runs slower than the others.
3893 }
3894 if (U_SUCCESS(status)) {
3895 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3896 }
3897 else {
3898 errln("Creation of line break iterator failed %s", u_errorName(status));
3899 }
3900 delete bi;
3901 }
3902
3903 if (breakType == "sent" || breakType == "all" ) {
3904 logln("Sentence Break Monkey Test");
3905 RBBISentMonkey m;
3906 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3907 if (loopCount >= 10) {
3908 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3909 }
3910 if (U_SUCCESS(status)) {
3911 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3912 }
3913 else {
3914 errln("Creation of line break iterator failed %s", u_errorName(status));
3915 }
3916 delete bi;
3917 }
3918
3919#endif
3920}
3921
3922//
3923// Run a RBBI monkey test. Common routine, for all break iterator types.
3924// Parameters:
3925// bi - the break iterator to use
3926// mk - MonkeyKind, abstraction for obtaining expected results
3927// name - Name of test (char, word, etc.) for use in error messages
3928// seed - Seed for starting random number generator (parameter from user)
3929// numIterations
3930//
3931void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3932 int32_t numIterations, UBool useUText) {
3933
3934#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3935
3936 const int32_t TESTSTRINGLEN = 500;
3937 UnicodeString testText;
3938 int32_t numCharClasses;
3939 UVector *chClasses;
3940 int expected[TESTSTRINGLEN*2 + 1];
3941 int expectedCount = 0;
3942 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3943 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3944 char reverseBreaks[TESTSTRINGLEN*2+1];
3945 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3946 char followingBreaks[TESTSTRINGLEN*2+1];
3947 char precedingBreaks[TESTSTRINGLEN*2+1];
3948 int i;
3949 int loopCount = 0;
3950
3951 m_seed = seed;
3952
3953 numCharClasses = mk.charClasses()->size();
3954 chClasses = mk.charClasses();
3955
3956 // Check for errors that occured during the construction of the MonkeyKind object.
3957 // Can't report them where they occured because errln() is a method coming from intlTest,
3958 // and is not visible outside of RBBITest :-(
3959 if (U_FAILURE(mk.deferredStatus)) {
3960 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3961 return;
3962 }
3963
3964 // Verify that the character classes all have at least one member.
3965 for (i=0; i<numCharClasses; i++) {
3966 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3967 if (s == NULL || s->size() == 0) {
3968 errln("Character Class #%d is null or of zero size.", i);
3969 return;
3970 }
3971 }
3972
3973 while (loopCount < numIterations || numIterations == -1) {
3974 if (numIterations == -1 && loopCount % 10 == 0) {
3975 // If test is running in an infinite loop, display a periodic tic so
3976 // we can tell that it is making progress.
3977 fprintf(stderr, ".");
3978 }
3979 // Save current random number seed, so that we can recreate the random numbers
3980 // for this loop iteration in event of an error.
3981 seed = m_seed;
3982
3983 // Populate a test string with data.
3984 testText.truncate(0);
3985 for (i=0; i<TESTSTRINGLEN; i++) {
3986 int32_t aClassNum = m_rand() % numCharClasses;
3987 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3988 int32_t charIdx = m_rand() % classSet->size();
3989 UChar32 c = classSet->charAt(charIdx);
3990 if (c < 0) { // TODO: deal with sets containing strings.
3991 errln("c < 0");
3992 break;
3993 }
3994 testText.append(c);
3995 }
3996
3997 // Calculate the expected results for this test string.
3998 mk.setText(testText);
3999 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4000 expectedBreaks[0] = 1;
4001 int32_t breakPos = 0;
4002 expectedCount = 0;
4003 for (;;) {
4004 breakPos = mk.next(breakPos);
4005 if (breakPos == -1) {
4006 break;
4007 }
4008 if (breakPos > testText.length()) {
4009 errln("breakPos > testText.length()");
4010 }
4011 expectedBreaks[breakPos] = 1;
4012 U_ASSERT(expectedCount<testText.length());
4013 expected[expectedCount ++] = breakPos;
4014 }
4015
4016 // Find the break positions using forward iteration
4017 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4018 if (useUText) {
4019 UErrorCode status = U_ZERO_ERROR;
4020 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4021 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4022 bi->setText(testUText, status);
4023 TEST_ASSERT_SUCCESS(status);
4024 utext_close(testUText); // The break iterator does a shallow clone of the UText
4025 // This UText can be closed immediately, so long as the
4026 // testText string continues to exist.
4027 } else {
4028 bi->setText(testText);
4029 }
4030
4031 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4032 if (i < 0 || i > testText.length()) {
4033 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4034 break;
4035 }
4036 forwardBreaks[i] = 1;
4037 }
4038
4039 // Find the break positions using reverse iteration
4040 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4041 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4042 if (i < 0 || i > testText.length()) {
4043 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4044 break;
4045 }
4046 reverseBreaks[i] = 1;
4047 }
4048
4049 // Find the break positions using isBoundary() tests.
4050 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4051 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4052 for (i=0; i<=testText.length(); i++) {
4053 isBoundaryBreaks[i] = bi->isBoundary(i);
4054 }
4055
4056
4057 // Find the break positions using the following() function.
4058 // printf(".");
4059 memset(followingBreaks, 0, sizeof(followingBreaks));
4060 int32_t lastBreakPos = 0;
4061 followingBreaks[0] = 1;
4062 for (i=0; i<testText.length(); i++) {
4063 breakPos = bi->following(i);
4064 if (breakPos <= i ||
4065 breakPos < lastBreakPos ||
4066 breakPos > testText.length() ||
4067 breakPos > lastBreakPos && lastBreakPos > i ) {
4068 errln("%s break monkey test: "
4069 "Out of range value returned by BreakIterator::following().\n"
4070 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4071 name, seed, i, breakPos, lastBreakPos);
4072 break;
4073 }
4074 followingBreaks[breakPos] = 1;
4075 lastBreakPos = breakPos;
4076 }
4077
4078 // Find the break positions using the preceding() function.
4079 memset(precedingBreaks, 0, sizeof(followingBreaks));
4080 lastBreakPos = testText.length();
4081 precedingBreaks[testText.length()] = 1;
4082 for (i=testText.length(); i>0; i--) {
4083 breakPos = bi->preceding(i);
4084 if (breakPos >= i ||
4085 breakPos > lastBreakPos ||
4086 breakPos < 0 && testText.getChar32Start(i)>0 ||
4087 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
4088 errln("%s break monkey test: "
4089 "Out of range value returned by BreakIterator::preceding().\n"
4090 "index=%d; prev returned %d; lastBreak=%d" ,
4091 name, i, breakPos, lastBreakPos);
4092 precedingBreaks[i] = 2; // Forces an error.
4093 } else {
4094 precedingBreaks[breakPos] = 1;
4095 lastBreakPos = breakPos;
4096 }
4097 }
4098
4099 // Compare the expected and actual results.
4100 for (i=0; i<=testText.length(); i++) {
4101 const char *errorType = NULL;
4102 if (forwardBreaks[i] != expectedBreaks[i]) {
4103 errorType = "next()";
4104 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4105 errorType = "previous()";
4106 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4107 errorType = "isBoundary()";
4108 } else if (followingBreaks[i] != expectedBreaks[i]) {
4109 errorType = "following()";
4110 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4111 errorType = "preceding()";
4112 }
4113
4114
4115 if (errorType != NULL) {
4116 // Format a range of the test text that includes the failure as
4117 // a data item that can be included in the rbbi test data file.
4118
4119 // Start of the range is the last point where expected and actual results
4120 // both agreed that there was a break position.
4121 int startContext = i;
4122 int32_t count = 0;
4123 for (;;) {
4124 if (startContext==0) { break; }
4125 startContext --;
4126 if (expectedBreaks[startContext] != 0) {
4127 if (count == 2) break;
4128 count ++;
4129 }
4130 }
4131
4132 // End of range is two expected breaks past the start position.
4133 int endContext = i + 1;
4134 int ci;
4135 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4136 for (;;) {
4137 if (endContext >= testText.length()) {break;}
4138 if (expectedBreaks[endContext-1] != 0) {
4139 if (count == 0) break;
4140 count --;
4141 }
4142 endContext ++;
4143 }
4144 }
4145
4146 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4147 UnicodeString errorText = "<data>";
4148 /***if (strcmp(errorType, "next()") == 0) {
4149 startContext = 0;
4150 endContext = testText.length();
4151
4152 printStringBreaks(testText, expected, expectedCount);
4153 }***/
4154
4155 for (ci=startContext; ci<endContext;) {
4156 UnicodeString hexChars("0123456789abcdef");
4157 UChar32 c;
4158 int bn;
4159 c = testText.char32At(ci);
4160 if (ci == i) {
4161 // This is the location of the error.
4162 errorText.append("<?>");
4163 } else if (expectedBreaks[ci] != 0) {
4164 // This a non-error expected break position.
4165 errorText.append("\\");
4166 }
4167 if (c < 0x10000) {
4168 errorText.append("\\u");
4169 for (bn=12; bn>=0; bn-=4) {
4170 errorText.append(hexChars.charAt((c>>bn)&0xf));
4171 }
4172 } else {
4173 errorText.append("\\U");
4174 for (bn=28; bn>=0; bn-=4) {
4175 errorText.append(hexChars.charAt((c>>bn)&0xf));
4176 }
4177 }
4178 ci = testText.moveIndex32(ci, 1);
4179 }
4180 errorText.append("\\");
4181 errorText.append("</data>\n");
4182
4183 // Output the error
4184 char charErrorTxt[500];
4185 UErrorCode status = U_ZERO_ERROR;
4186 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4187 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4188 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4189 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4190 errorType, seed, i, charErrorTxt);
4191 break;
4192 }
4193 }
4194
4195 loopCount++;
4196 }
4197#endif
4198}
4199
4200//
4201// TestDebug - A place-holder test for debugging purposes.
4202// For putting in fragments of other tests that can be invoked
4203// for tracing without a lot of unwanted extra stuff happening.
4204//
4205void RBBITest::TestDebug(void) {
4206#if 0
4207 UErrorCode status = U_ZERO_ERROR;
4208 int pos = 0;
4209 int ruleStatus = 0;
4210
4211 RuleBasedBreakIterator* bi =
4212 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4213 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4214 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4215 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4216 // UnicodeString s("Aaa. Bcd");
4217 s = s.unescape();
4218 bi->setText(s);
4219 UBool r = bi->isBoundary(8);
4220 printf("%s", r?"true":"false");
4221 return;
4222 pos = bi->last();
4223 do {
4224 // ruleStatus = bi->getRuleStatus();
4225 printf("%d\t%d\n", pos, ruleStatus);
4226 pos = bi->previous();
4227 } while (pos != BreakIterator::DONE);
4228#endif
4229}
4230
4231#endif /* #if !UCONFIG_NO_BREAK_ITERATION */