+// Cross-checks the boundaries reported by a break iterator against a list of
+// expected boundaries, reporting any disagreement through test->errln().
+//
+//   test           test object used for error reporting.
+//   ustr           text to analyze; this function sets it on bi.
+//   bi             break iterator under test.
+//   expected       expected boundary positions, in the order a forward
+//                  first()/next() walk should produce them.
+//   expectedcount  number of entries in expected.
+//
+// Four checks run in sequence, and the function returns at the first failure:
+//   1. first()/next() must yield exactly the expected positions.
+//   2. isBoundary() must be true at each expected position examined and
+//      false at every position strictly between two neighbours.
+//   3. last()/previous() must replay the forward walk in reverse.
+//   4. preceding(j) must return expected[i] for positions j after expected[i]
+//      up to and including expected[i + 1].
+static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
+                                    BreakIterator *bi,
+                                    int expected[],
+                                    int expectedcount)
+{
+    const int FORWARDSIZE = 50;
+    int count = 0;
+    int i = 0;
+    int forward[FORWARDSIZE];
+    bi->setText(ustr);
+    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+        if (count >= FORWARDSIZE) {
+            // forward[] is full; recording another boundary would write past
+            // the end of the array.
+            printStringBreaks(ustr, expected, expectedcount);
+            test->errln("break forward test failed: more than %d boundaries found",
+                        FORWARDSIZE);
+            return;
+        }
+        forward[count] = i;
+        if (count < expectedcount && expected[count] != i) {
+            test->errln("break forward test failed: expected %d but got %d",
+                        expected[count], i);
+            break;
+        }
+        count ++;
+    }
+    // Also reached after the early break above, in which case count is
+    // necessarily smaller than expectedcount.
+    if (count != expectedcount) {
+        printStringBreaks(ustr, expected, expectedcount);
+        test->errln("break forward test failed: missed %d match",
+                    expectedcount - count);
+        return;
+    }
+    // testing boundaries
+    for (i = 1; i < expectedcount; i ++) {
+        int j = expected[i - 1];
+        if (!bi->isBoundary(j)) {
+            printStringBreaks(ustr, expected, expectedcount);
+            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
+            return;
+        }
+        // No position strictly between two expected boundaries may be a boundary.
+        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
+            if (bi->isBoundary(j)) {
+                printStringBreaks(ustr, expected, expectedcount);
+                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
+                return;
+            }
+        }
+    }
+
+    // Walk backwards; count (== expectedcount here) is consumed down to zero.
+    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
+        if (count <= 0) {
+            // previous() produced more boundaries than next() did; going on
+            // would index forward[] with a negative subscript.
+            printStringBreaks(ustr, expected, expectedcount);
+            test->errln("break test previous() failed: unexpected extra boundary at %d", i);
+            return;
+        }
+        count --;
+        if (forward[count] != i) {
+            test->errln("happy break test previous() failed: expected %d but got %d",
+                        forward[count], i);
+            break;
+        }
+    }
+    if (count != 0) {
+        printStringBreaks(ustr, expected, expectedcount);
+        test->errln("break test previous() failed: missed a match");
+        return;
+    }
+
+    // testing preceding
+    for (i = 0; i < expectedcount - 1; i ++) {
+        // int j = expected[i] + 1;
+        // Start one code point (not one code unit) after expected[i].
+        int j = ustr.moveIndex32(expected[i], 1);
+        for (; j <= expected[i + 1]; j ++) {
+            if (bi->preceding(j) != expected[i]) {
+                printStringBreaks(ustr, expected, expectedcount);
+                test->errln("preceding(): Not expecting boundary at position %d", j);
+                return;
+            }
+        }
+    }
+}
+
+// Runs the "en" word break iterator over a fixed list of escaped test strings
+// and compares it, via testBreakBoundPreceding(), with the boundaries that
+// RBBIWordMonkey computes for the same text.
+void RBBITest::TestWordBreaks(void)
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
+    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+    static const char *strlist[] =
+    {
+    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
+    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
+    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
+    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
+    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
+    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
+    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
+    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
+    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
+    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
+    "\\u2027\\U000e0067\\u0a47\\u00b7",
+    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
+    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
+    "\\u0589\\U000e006e\\u0a42\\U000104a5",
+    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
+    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
+    "\\u0027\\u11af\\U000e0057\\u0602",
+    // NOTE(review): the second escape in the next string (U000e007) has only
+    // seven hex digits before the following escape starts; the entry two
+    // lines below carries U000e007d instead. Confirm whether the malformed
+    // escape is intentional test data.
+    "\\U0001d7f2\\U000e007\\u0004\\u0589",
+    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
+    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
+    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
+    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
+    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
+    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
+    "\\u0233\\U000e0020\\u0a69\\u0d6a",
+    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
+    "\\u58f4\\U000e0049\\u20e7\\u2027",
+    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
+    "\\ua183\\u102d\\u0bec\\u003a",
+    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
+    "\\u003a\\u0e57\\u0fad\\u002e",
+    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
+    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
+    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
+    "\\u003a\\u0664\\u00b7\\u1fba",
+    "\\u003b\\u0027\\u00b7\\u47a3",
+    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
+    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
+    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
+    };
+    int loop;
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
+        return;
+    }
+    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
+        // printf("looping %d\n", loop);
+        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
+        // RBBICharMonkey monkey;
+        RBBIWordMonkey monkey;
+
+        const int EXPECTEDSIZE = 50;
+        int expected[EXPECTEDSIZE];
+        int expectedcount = 0;
+
+        // Collect the reference boundaries from the monkey implementation.
+        monkey.setText(ustr);
+        int i;
+        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
+            if (expectedcount >= EXPECTEDSIZE) {
+                // expected[] is full; stop before writing past its end and
+                // release the iterator on the way out.
+                errln("TestWordBreaks: more than %d expected boundaries", EXPECTEDSIZE);
+                delete bi;
+                return;
+            }
+            expected[expectedcount ++] = i;
+        }
+
+        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
+    }
+    delete bi;
+#endif
+}
+
+// Checks that isBoundary() of the "en" word break iterator agrees with its
+// own first()/next() iteration: every position returned by the iteration must
+// be reported as a boundary, and every position strictly between two
+// consecutive returned positions must not be.
+void RBBITest::TestWordBoundary(void)
+{
+    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
+    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+    const int32_t STRSIZE = 50;
+    UChar         str[STRSIZE];
+    static const char *strlist[] =
+    {
+    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
+    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
+    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
+    "\\u2027\\U000e0067\\u0a47\\u00b7",
+    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
+    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
+    "\\u0589\\U000e006e\\u0a42\\U000104a5",
+    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
+    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
+    "\\u0027\\u11af\\U000e0057\\u0602",
+    "\\U0001d7f2\\U000e007\\u0004\\u0589",
+    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
+    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
+    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
+    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
+    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
+    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
+    "\\u0233\\U000e0020\\u0a69\\u0d6a",
+    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
+    "\\u58f4\\U000e0049\\u20e7\\u2027",
+    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
+    "\\ua183\\u102d\\u0bec\\u003a",
+    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
+    "\\u003a\\u0e57\\u0fad\\u002e",
+    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
+    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
+    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
+    "\\u003a\\u0664\\u00b7\\u1fba",
+    "\\u003b\\u0027\\u00b7\\u47a3",
+    };
+    int loop;
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
+        return;
+    }
+    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
+        // printf("looping %d\n", loop);
+        // Unescape into the whole buffer. A result that fills it completely
+        // would leave no room for the terminating NUL that the
+        // UnicodeString(const UChar *) constructor relies on.
+        int32_t len = u_unescape(strlist[loop], str, STRSIZE);
+        if (len >= STRSIZE) {
+            errln("TestWordBoundary: test string %d does not fit in the buffer", loop);
+            continue;
+        }
+        UnicodeString ustr(str);
+        const int FORWARDSIZE = 50;
+        int forward[FORWARDSIZE];
+        int count = 0;
+
+        bi->setText(ustr);
+        int prev = 0;
+        int i;
+        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+            if (count >= FORWARDSIZE) {
+                // forward[] is full; stop before writing past its end.
+                errln("happy boundary test failed: more than %d boundaries found",
+                      FORWARDSIZE);
+                delete bi;
+                return;
+            }
+            forward[count ++] = i;
+            if (i > prev) {
+                // Positions strictly between prev and i must not be boundaries.
+                int j;
+                for (j = prev + 1; j < i; j ++) {
+                    if (bi->isBoundary(j)) {
+                        printStringBreaks(ustr, forward, count);
+                        errln("happy boundary test failed: expected %d not a boundary",
+                              j);
+                        delete bi;   // do not leak the iterator on failure
+                        return;
+                    }
+                }
+            }
+            if (!bi->isBoundary(i)) {
+                printStringBreaks(ustr, forward, count);
+                errln("happy boundary test failed: expected %d a boundary",
+                      i);
+                delete bi;   // do not leak the iterator on failure
+                return;
+            }
+            prev = i;
+        }
+    }
+    delete bi;
+}
+
+// Runs the "en" line break iterator over a fixed list of escaped test strings
+// and compares it, via testBreakBoundPreceding(), with the boundaries that
+// RBBILineMonkey computes for the same text.
+void RBBITest::TestLineBreaks(void)
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
+    const int32_t  STRSIZE = 50;
+    UChar         str[STRSIZE];
+    static const char *strlist[] =
+    {
+     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
+     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
+             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
+     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
+             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
+     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
+     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
+     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
+     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
+     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
+     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
+     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
+     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
+     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
+     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
+     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
+     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
+     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
+     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
+     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
+     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
+     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
+     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
+     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
+     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
+     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
+     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
+     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
+     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
+     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
+     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
+     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
+     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
+     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
+     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
+     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
+     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
+     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
+     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
+     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
+     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
+     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
+     "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
+             "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
+             "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
+     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
+     "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
+    };
+    int loop;
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
+        // printf("looping %d\n", loop);
+        // Skip (and flag) any test string that does not fit in str with room
+        // for the terminating NUL.
+        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
+        if (t >= STRSIZE) {
+            TEST_ASSERT(FALSE);
+            continue;
+        }
+
+
+        UnicodeString ustr(str);
+        RBBILineMonkey monkey;
+        if (U_FAILURE(monkey.deferredStatus)) {
+            continue;
+        }
+
+        const int EXPECTEDSIZE = 50;
+        int expected[EXPECTEDSIZE];
+        int expectedcount = 0;
+
+        // Collect the reference boundaries from the monkey implementation.
+        monkey.setText(ustr);
+        int i;
+        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
+            if (expectedcount >= EXPECTEDSIZE) {
+                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
+                delete bi;   // do not leak the iterator on this early exit
+                return;
+            }
+            expected[expectedcount ++] = i;
+        }
+
+        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
+    }
+    delete bi;
+#endif
+}
+
+// Runs the "en" sentence break iterator over a fixed list of test strings and
+// compares it, via testBreakBoundPreceding(), with the boundaries that
+// RBBISentMonkey computes for the same text.
+void RBBITest::TestSentBreaks(void)
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
+    const int32_t  STRSIZE = 200;
+    UChar         str[STRSIZE];
+    static const char *strlist[] =
+    {
+     "Now\ris\nthe\r\ntime\n\rfor\r\r",
+     "This\n",
+     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
+     "\"Sentence ending with a quote.\" Bye.",
+     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
+     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
+     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
+     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
+     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
+     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
+     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
+             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
+             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
+             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
+     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
+             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
+             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
+             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
+             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
+             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
+    };
+    int loop;
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
+        return;
+    }
+    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
+        // Skip (and flag) any test string that does not fit in str with room
+        // for the terminating NUL that UnicodeString(const UChar *) relies on.
+        int32_t len = u_unescape(strlist[loop], str, STRSIZE);
+        if (len >= STRSIZE) {
+            TEST_ASSERT(len < STRSIZE);
+            continue;
+        }
+        UnicodeString ustr(str);
+
+        RBBISentMonkey monkey;
+        if (U_FAILURE(monkey.deferredStatus)) {
+            continue;
+        }
+
+        const int EXPECTEDSIZE = 50;
+        int expected[EXPECTEDSIZE];
+        int expectedcount = 0;
+
+        // Collect the reference boundaries from the monkey implementation.
+        monkey.setText(ustr);
+        int i;
+        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
+            if (expectedcount >= EXPECTEDSIZE) {
+                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
+                delete bi;   // do not leak the iterator on this early exit
+                return;
+            }
+            expected[expectedcount ++] = i;
+        }
+
+        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
+    }
+    delete bi;
+#endif
+}
+