+// Checks the break positions reported by the "th" locale word and line break
+// iterators for a short Thai sample against hard-coded expected offsets.
+// Iteration stops as soon as the reported position reaches the text length,
+// so the final break at the end of the text is not part of the expected data.
+void RBBITest::TestThaiBreaks() {
+    UErrorCode status=U_ZERO_ERROR;
+    BreakIterator* b;
+    Locale locale = Locale("th");
+    int32_t p, index;
+    UChar c[]= {
+        0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
+        0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
+        0x0E16, 0x0E49, 0x0E33, 0x0000
+    };
+    int32_t expectedWordResult[] = {
+        2, 3, 6, 10, 11, 15, 17, 20, 22
+    };
+    int32_t expectedLineResult[] = {
+        3, 6, 11, 15, 17, 20, 22
+    };
+    const int32_t expectedWordCount = (int32_t)(sizeof(expectedWordResult)/sizeof(expectedWordResult[0]));
+    const int32_t expectedLineCount = (int32_t)(sizeof(expectedLineResult)/sizeof(expectedLineResult[0]));
+
+    int32_t size = u_strlen(c);
+    UnicodeString text=UnicodeString(c);
+
+    // ---- word breaks ----
+    b = BreakIterator::createWordInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (index >= expectedWordCount) {
+            // More breaks than expected: report instead of reading past the array.
+            errln("Unexpected extra break given by thai word break iterator. Got: %d", p);
+            break;
+        }
+        if (p != expectedWordResult[index]) {
+            errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index], p);
+        }
+        index++;
+    }
+    if (index < expectedWordCount) {
+        errln("Missing breaks from thai word break iterator. Expected %d breaks, got %d", expectedWordCount, index);
+    }
+    delete b;
+
+    // ---- line breaks ----
+    b = BreakIterator::createLineInstance(locale, status);
+    if (U_FAILURE(status)) {
+        // Report through the test framework (same as the word case) so that a
+        // creation failure is not silently treated as a pass.
+        errcheckln(status, "Unable to create thai line break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (index >= expectedLineCount) {
+            // More breaks than expected: report instead of reading past the array.
+            errln("Unexpected extra break given by thai line break iterator. Got: %d", p);
+            break;
+        }
+        if (p != expectedLineResult[index]) {
+            errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index], p);
+        }
+        index++;
+    }
+    if (index < expectedLineCount) {
+        errln("Missing breaks from thai line break iterator. Expected %d breaks, got %d", expectedLineCount, index);
+    }
+
+    delete b;
+}
+
+// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
+// Words don't include colon or period (cldrbug #1969).
+// Test data for the tbItems table: the sample text, followed by the break
+// offsets expected from the tailored locale (...TOffsets) and from the root
+// locale (...ROffsets).
+static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types.";
+static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
+static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
+
+// UBreakIteratorType UBRK_WORD, Locale "ja"
+// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
+// The text is kept in escaped form (backslash-u sequences as plain chars);
+// TBTest converts it with CharsToUnicodeString before iterating.
+// ...TOffsets are the offsets expected for the tailored locale, ...ROffsets
+// those expected for the root locale.
+static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
+ "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
+static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
+static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
+
+// UBreakIteratorType UBRK_SENTENCE, Locale "el"
+// Add break after Greek question mark (cldrbug #2069).
+// The text is kept in escaped form; TBTest converts it with
+// CharsToUnicodeString before iterating.
+// ...TOffsets are the offsets expected for the tailored locale, ...ROffsets
+// those expected for the root locale.
+static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
+ "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
+static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
+static const int32_t elSentROffsets[] = { 20, 27, 35, 36 };
+
+// UBreakIteratorType UBRK_CHARACTER, Locale "th"
+// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
+// The text is kept in escaped form; TBTest converts it with
+// CharsToUnicodeString before iterating.
+// ...TOffsets are the offsets expected for the tailored locale, ...ROffsets
+// those expected for the root locale.
+static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
+ "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
+ "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
+static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
+static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11,
+ 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28,
+ 29, 32, 33, 35, 37, 38, 40, 41 };
+
+// One row of the tailored-break test table: a break iterator type, the locale
+// to tailor for, the sample text, and the break offsets expected from the
+// tailored and from the root iterator. Field order matters: rows are written
+// as positional aggregate initializers.
+struct TailoredBreakItem {
+    UBreakIteratorType  type;                  // which kind of break iterator to create
+    const char *        locale;                // locale ID of the tailored iterator
+    const char *        escapedText;           // sample text in escaped form; NULL marks the end of the table
+    const int32_t *     tailoredOffsets;       // offsets expected from the tailored iterator
+    int32_t             tailoredOffsetsCount;  // number of entries in tailoredOffsets
+    const int32_t *     rootOffsets;           // offsets expected from the root iterator
+    int32_t             rootOffsetsCount;      // number of entries in rootOffsets
+};
+
+// Expands to TWO comma-separated initializers for a true array (not a pointer):
+// the array itself and its element count as an int32_t. The macro argument is
+// parenthesized, and the size_t count is cast explicitly so that it does not
+// rely on an implicit narrowing conversion inside a braced initializer.
+#define ARRAY_PTR_LEN(array) (array),((int32_t)(sizeof(array)/sizeof((array)[0])))
+
+// Table of tailored-break test cases walked by TestTailoredBreaks.
+// Each row: iterator type, tailored locale, escaped sample text, then the
+// (pointer, count) pairs of expected offsets for the tailored and the root
+// iterator. The table ends with a row whose text pointer is NULL.
+static const TailoredBreakItem tbItems[] = {
+ { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
+ { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) },
+ { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) },
+ { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) },
+ { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator
+};
+
+// Formats `count` offsets into `buffer` as a sequence of " %d" items
+// (each value preceded by one space), for use in failure messages.
+//   buffer  - destination; always left NUL-terminated when buflen > 0
+//   buflen  - capacity of buffer in bytes, including the terminator
+//   count   - number of entries to read from offsets
+//   offsets - values to format
+// Output is bounded by buflen: formatting stops before the first value that
+// does not fit completely, so a partially written number is never left behind.
+static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
+    if (buffer == NULL || buflen <= 0) {
+        return;
+    }
+    *buffer = 0;    // valid empty string even when count is 0
+    while (count-- > 0) {
+        // snprintf's return value gives the length; no reliance on %n, which
+        // some C runtimes disable or reject.
+        int writeCount = snprintf(buffer, (size_t)buflen, " %d", (int)*offsets++);
+        if (writeCount < 0 || writeCount >= buflen) {
+            *buffer = 0;    // error or truncation: drop the partial item and stop
+            break;
+        }
+        buffer += writeCount;
+        buflen -= writeCount;
+    }
+}
+
+enum { kMaxOffsetCount = 128 };
+
+// Runs one tailored-break check on an already created break iterator.
+//   brkitr             - iterator under test; its text is replaced here
+//   type, locale       - only used to label failure messages
+//   escapedText        - sample text in escaped form, converted with CharsToUnicodeString
+//   expectOffsets      - break offsets expected from forward iteration
+//   expectOffsetsCount - number of entries in expectOffsets
+// Forward iteration is collected (at most kMaxOffsetCount offsets) and compared
+// with the expected list. Only if that matches is backward iteration checked,
+// by walking previous() back through the offsets that were found.
+void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
+    brkitr->setText( CharsToUnicodeString(escapedText) );
+    int32_t foundOffsets[kMaxOffsetCount];
+    int32_t offset, foundOffsetsCount = 0;
+    // do forwards iteration test
+    while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
+        foundOffsets[foundOffsetsCount++] = offset;
+    }
+    if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
+        // log error for forwards test
+        // Buffers start out as empty strings: when a count is zero nothing is
+        // formatted into them, and they are still printed with %s below.
+        char formatExpect[512] = "", formatFound[512] = "";
+        formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
+        formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
+        errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
+              type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
+    } else {
+        // do backwards iteration test
+        --foundOffsetsCount; // back off one from the end offset
+        while ( foundOffsetsCount > 0 ) {
+            offset = brkitr->previous();
+            if ( offset != foundOffsets[--foundOffsetsCount] ) {
+                // log error for backwards test
+                char formatExpect[512] = "";
+                formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
+                errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
+                      type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
+                break;
+            }
+        }
+    }
+}
+
+// Walks the tbItems table. For every row it creates a break iterator of the
+// row's type for the tailored locale and another for the root locale, then has
+// TBTest compare each against its own list of expected offsets.
+// A row whose iterators cannot be created is reported and skipped.
+void RBBITest::TestTailoredBreaks() {
+    const TailoredBreakItem * tbItemPtr;
+    Locale rootLocale = Locale("root");
+    for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
+        Locale testLocale = Locale(tbItemPtr->locale);
+        BreakIterator * tailoredBrkiter = NULL;
+        BreakIterator * rootBrkiter = NULL;
+        UErrorCode status = U_ZERO_ERROR;
+        switch (tbItemPtr->type) {
+            case UBRK_CHARACTER:
+                tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
+                rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
+                break;
+            case UBRK_WORD:
+                tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
+                rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
+                break;
+            case UBRK_LINE:
+                tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
+                rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
+                break;
+            case UBRK_SENTENCE:
+                tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
+                rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
+                break;
+            default:
+                status = U_UNSUPPORTED_ERROR;
+                break;
+        }
+        if (U_FAILURE(status)) {
+            errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
+            // One iterator may have been created before the other one failed;
+            // release whatever exists so the skipped row does not leak it.
+            // Deleting a NULL pointer is a no-op.
+            delete rootBrkiter;
+            delete tailoredBrkiter;
+            continue;
+        }
+        TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
+        TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount);
+
+        delete rootBrkiter;
+        delete tailoredBrkiter;
+    }
+}
+
+
+//-------------------------------------------------------------------------------
+//
+// TestDictRules   Build a break iterator from source rules that contain a
+//                 dictionary range. Regression test for bug #7130: the source
+//                 rules do not declare a break iterator type (word, line,
+//                 sentence, etc.), and the dictionary code, lacking a type,
+//                 would loop.
+//
+//                 The test iterates over a two-character text with an upper
+//                 bound of 10 steps and asserts that exactly one boundary is
+//                 returned before DONE.
+//
+//-------------------------------------------------------------------------------
+void RBBITest::TestDictRules() {
+    const char *rules = "$dictionary = [a-z]; \n"
+                        "!!forward; \n"
+                        "$dictionary $dictionary; \n"
+                        "!!reverse; \n"
+                        "$dictionary $dictionary; \n";
+    const char *text = "aa";
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+
+    RuleBasedBreakIterator bi(rules, parseError, status);
+    if (U_FAILURE(status)) {
+        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
+        return;
+    }
+
+    UnicodeString utext = text;
+    bi.setText(utext);
+
+    // Count the boundaries returned before DONE; the cap of 10 keeps the test
+    // finite if the iterator never reports DONE.
+    int32_t loops = 0;
+    while (loops < 10 && bi.next() != RuleBasedBreakIterator::DONE) {
+        ++loops;
+    }
+    TEST_ASSERT(loops == 1);
+}
+
+