/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1999-2013, International Business Machines Corporation and
+ * Copyright (c) 1999-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
#include "intltest.h"
#include "rbbitst.h"
#include <string.h>
+#include "charstr.h"
#include "uvector.h"
#include "uvectr32.h"
-#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "unicode/numfmt.h"
#include "unicode/uscript.h"
+#include "cmemory.h"
#define TEST_ASSERT(x) {if (!(x)) { \
errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
- RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
+ BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
if(U_FAILURE(status)) {
dataerrln("FAIL : in construction - %s", u_errorName(status));
} else {
}
-static void printStringBreaks(UnicodeString ustr, int expected[],
- int expectedcount)
-{
+static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { // Debug dump: prints one table row per code point of tstr, with a dashed marker row at expected[] positions.
UErrorCode status = U_ZERO_ERROR;
char name[100];
printf("code alpha extend alphanum type word sent line name\n");
- int j;
- for (j = 0; j < ustr.length(); j ++) {
- if (expectedcount > 0) {
- int k;
- for (k = 0; k < expectedcount; k ++) {
- if (j == expected[k]) {
- printf("------------------------------------------------ %d\n",
- j);
- }
- }
- }
- UChar32 c = ustr.char32At(j);
- if (c > 0xffff) {
- j ++;
+ int nextExpectedIndex = 0;
+ utext_setNativeIndex(tstr, 0); // Always start the dump from the beginning of the text.
+ for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { // j is re-read from tstr after each utext_next32() below.
+ if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { // Only the next pending entry is tested, so expected[] needs to be in ascending order for every marker to print.
+ printf("------------------------------------------------ %d\n", j);
+ ++nextExpectedIndex;
}
+ // Fetch the code point at the current position; this also moves tstr's index forward.
+ UChar32 c = utext_next32(tstr);
u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
u_isUAlphabetic(c),
}
+// Convenience overload: wraps ustr in a read-only UText and hands it to the
+// UText flavor of printStringBreaks(). If the UText cannot be opened, a
+// diagnostic is printed and nothing else happens.
+static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
+    UErrorCode openStatus = U_ZERO_ERROR;
+    UText *text = utext_openConstUnicodeString(NULL, &ustr, &openStatus);
+    if (U_SUCCESS(openStatus)) {
+        printStringBreaks(text, expected, expectedCount);
+        utext_close(text);
+    } else {
+        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(openStatus));
+    }
+}
+
+
void RBBITest::TestBug3818() {
UErrorCode status = U_ZERO_ERROR;
0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
UnicodeString thaiStr(thaiWordData);
- RuleBasedBreakIterator* bi =
- (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
+ BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
if (U_FAILURE(status) || bi == NULL) {
errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
return;
//------------------------------------------------------------------------------
struct TestParams {
- BreakIterator *bi;
- UnicodeString dataToBreak;
- UVector32 *expectedBreaks;
- UVector32 *srcLine;
+ BreakIterator *bi; // Break iterator is set while parsing test source.
+ // Changed out whenever test data changes break type.
+
+ UnicodeString dataToBreak; // Data that is built up while parsing the test.
+ UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
+ UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
UVector32 *srcCol;
+ // srcCol (above): source-file columns; the accessors look it up with the same index as srcLine.
+ UText *textToBreak; // UText, could be UTF8 or UTF16.
+ UVector32 *textMap; // Indexed by native offset in textToBreak; holds the matching UTF-16 offset in dataToBreak, or -1 when the offset is not at the start of a code point.
+ CharString utf8String; // UTF-8 form of text to break.
+
+ TestParams(UErrorCode &status) : dataToBreak() { // Allocates the four UVector32s; bi and textToBreak start out NULL.
+ bi = NULL;
+ expectedBreaks = new UVector32(status);
+ srcLine = new UVector32(status);
+ srcCol = new UVector32(status);
+ textToBreak = NULL;
+ textMap = new UVector32(status);
+ }
+ // NOTE(review): the destructor deletes the raw pointers but no copy constructor/assignment is declared here, so copying a TestParams would double-free.
+ ~TestParams() { // Frees bi and the vectors, and closes textToBreak.
+ delete bi;
+ delete expectedBreaks;
+ delete srcLine;
+ delete srcCol;
+ utext_close(textToBreak);
+ delete textMap;
+ }
+ // The accessors below take a native index into textToBreak and translate it through textMap.
+ int32_t getSrcLine(int32_t bp);
+ int32_t getExpectedBreak(int32_t bp);
+ int32_t getSrcCol(int32_t bp);
+ // Point textToBreak at the UTF-16 or UTF-8 form of dataToBreak and rebuild textMap to match.
+ void setUTF16(UErrorCode &status);
+ void setUTF8(UErrorCode &status);
};
-void RBBITest::executeTest(TestParams *t) {
+// Convert a UnicodeString to UTF-8 and append the result to a CharString.
+// Anything that cannot be converted is replaced with U+FFFD.
+// Note: the test data includes a few unpaired surrogates in its UTF-16; those are
+//       the characters that end up substituted.
+// Does nothing when status is already a failure on entry.
+static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    const UChar *utf16 = src.getBuffer();
+    const int32_t utf16Length = src.length();
+
+    // Pass 1: preflight with no output buffer, only to learn the UTF-8 length.
+    int32_t utf8Length;
+    u_strToUTF8WithSub(NULL, 0, &utf8Length, utf16, utf16Length, 0xfffd, NULL, &status);
+
+    // A buffer overflow is tolerated here because no buffer was supplied;
+    // any other failure is passed back to the caller.
+    const UBool preflightUsable = U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR;
+    if (!preflightUsable) {
+        return;
+    }
+    status = U_ZERO_ERROR;
+
+    // Pass 2: convert directly into dest's append buffer, then commit the bytes.
+    int32_t capacity;
+    char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
+    u_strToUTF8WithSub(buffer, utf8Length, NULL, utf16, utf16Length, 0xfffd, NULL, &status);
+    dest.append(buffer, utf8Length, status);
+}
+
+
+// Make the UTF-16 dataToBreak itself the text under test.
+// Re-opens textToBreak over dataToBreak and rebuilds textMap, which for UTF-16 is
+// the identity mapping at code point starts and -1 at the second unit of a
+// surrogate pair, plus one final entry for the end-of-text position.
+// Leaves everything untouched when status is a failure on entry or when the
+// UText cannot be opened (same convention as setUTF8()).
+void TestParams::setUTF16(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    textMap->removeAllElements();
+    for (int32_t i=0; i<dataToBreak.length(); i++) {
+        if (i == dataToBreak.getChar32Start(i)) {
+            textMap->addElement(i, status);
+        } else {
+            // Trail surrogate: not a position a break can be reported at.
+            textMap->addElement(-1, status);
+        }
+    }
+    textMap->addElement(dataToBreak.length(), status);
+    U_ASSERT(dataToBreak.length() + 1 == textMap->size());
+}
+
+
+// Switch the text under test to a UTF-8 encoded copy of dataToBreak.
+// Rebuilds utf8String, re-opens textToBreak over it, and regenerates textMap so
+// that entry [n] holds the UTF-16 offset in dataToBreak of the code point that
+// starts at UTF-8 offset n, or -1 when n falls inside a multi-byte sequence.
+// The last entry corresponds to the end-of-text position.
+void TestParams::setUTF8(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    utf8String.clear();
+    CharStringAppend(utf8String, dataToBreak, status);
+    textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    textMap->removeAllElements();
+    int32_t offsetInUTF16 = 0;
+    // Entry for the very first position; each pass of the loop then emits the
+    // entry for the position that follows the code point it just consumed.
+    textMap->addElement(offsetInUTF16, status);
+    for (UChar32 cp = utext_current32(textToBreak); cp >= 0; cp = utext_current32(textToBreak)) {
+        offsetInUTF16 += U16_LENGTH(cp);
+        utext_next32(textToBreak);
+        // Pad with -1 for the remaining bytes of the code point just passed.
+        while (textMap->size() < utext_getNativeIndex(textToBreak)) {
+            textMap->addElement(-1, status);
+        }
+        textMap->addElement(offsetInUTF16, status);
+    }
+    U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
+}
+
+
+// Return the test-source line number recorded for position bp of textToBreak.
+// bp is a native index into textToBreak. Values past the end of textMap are
+// clamped to its last entry, and a bp that is not at the start of a code point
+// is moved back to the nearest preceding one before the lookup.
+// The parameter type is int32_t so the definition matches the declaration in
+// struct TestParams.
+int32_t TestParams::getSrcLine(int32_t bp) {
+    if (bp >= textMap->size()) {
+        bp = textMap->size() - 1;
+    }
+    int32_t i = 0;
+    for(; bp >= 0 ; --bp) {
+        // Move to a character boundary if we are not on one already.
+        i = textMap->elementAti(bp);
+        if (i >= 0) {
+            break;
+        }
+    }
+    return srcLine->elementAti(i);
+}
+
+
+// Return the expected-break entry for position bp of textToBreak.
+// bp is a native index into textToBreak. The result is 0 when bp lies beyond
+// textMap or textMap holds a negative entry for it (not the start of a code
+// point); otherwise it is the expectedBreaks value at the mapped UTF-16 offset.
+// The parameter type is int32_t so the definition matches the declaration in
+// struct TestParams.
+int32_t TestParams::getExpectedBreak(int32_t bp) {
+    if (bp >= textMap->size()) {
+        return 0;
+    }
+    int32_t i = textMap->elementAti(bp);
+    int32_t retVal = 0;
+    if (i >= 0) {
+        retVal = expectedBreaks->elementAti(i);
+    }
+    return retVal;
+}
+
+
+// Return the test-source column recorded for position bp of textToBreak.
+// bp is a native index into textToBreak. Values past the end of textMap are
+// clamped to its last entry, and a bp that is not at the start of a code point
+// is moved back to the nearest preceding one before the lookup.
+// The parameter type is int32_t so the definition matches the declaration in
+// struct TestParams.
+int32_t TestParams::getSrcCol(int32_t bp) {
+    if (bp >= textMap->size()) {
+        bp = textMap->size() - 1;
+    }
+    int32_t i = 0;
+    for(; bp >= 0; --bp) {
+        // Move bp to a character boundary if we are not on one already.
+        i = textMap->elementAti(bp);
+        if (i >= 0) {
+            break;
+        }
+    }
+    return srcCol->elementAti(i);
+}
+
+
+// Run one parsed test case: attach t->textToBreak to t->bi, then check forward
+// iteration, reverse iteration, isBoundary(), following() and preceding() against
+// the expected break data, reporting every mismatch with errln().
+// Returns early if status is already a failure (after reporting it) or if t->bi is NULL.
+// Positions (bp, i, j) are native indexes into t->textToBreak; they are translated
+// to expected-break / source line / source column data by the TestParams accessors.
+void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
int32_t bp;
int32_t prevBP;
int32_t i;
+ TEST_ASSERT_SUCCESS(status);
+ if (U_FAILURE(status)) {
+ return;
+ }
+
if (t->bi == NULL) {
return;
}
- t->bi->setText(t->dataToBreak);
+ t->bi->setText(t->textToBreak, status);
//
// Run the iterator forward
//
if (prevBP == bp) {
// Fail for lack of forward progress.
errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
- bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+ bp, t->getSrcLine(bp), t->getSrcCol(bp));
break;
}
- // Check that there were we didn't miss an expected break between the last one
+ // Check that we didn't miss an expected break between the last one
// and this one.
for (i=prevBP+1; i<bp; i++) {
- if (t->expectedBreaks->elementAti(i) != 0) {
+ if (t->getExpectedBreak(i) != 0) {
int expected[] = {0, i};
- printStringBreaks(t->dataToBreak, expected, 2);
+ // i is a native index into textToBreak, so dump that text rather than the UTF-16 dataToBreak.
+ printStringBreaks(t->textToBreak, expected, 2);
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+ i, t->getSrcLine(i), t->getSrcCol(i));
}
}
// Check that the break we did find was expected
- if (t->expectedBreaks->elementAti(bp) == 0) {
+ if (t->getExpectedBreak(bp) == 0) {
int expected[] = {0, bp};
- printStringBreaks(t->dataToBreak, expected, 2);
+ printStringBreaks(t->textToBreak, expected, 2);
errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
- bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+ bp, t->getSrcLine(bp), t->getSrcCol(bp));
} else {
// The break was expected.
// Check that the {nnn} tag value is correct.
- int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
+ int32_t expectedTagVal = t->getExpectedBreak(bp);
if (expectedTagVal == -1) {
expectedTagVal = 0;
}
- int32_t line = t->srcLine->elementAti(bp);
+ int32_t line = t->getSrcLine(bp);
- int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
+ int32_t rs = t->bi->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
- bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
+ bp, line, t->getSrcCol(bp), rs, expectedTagVal);
}
}
-
prevBP = bp;
}
// Verify that there were no missed expected breaks after the last one found
- for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
- if (t->expectedBreaks->elementAti(i) != 0) {
+ for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
+ if (t->getExpectedBreak(i) != 0) {
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+ i, t->getSrcLine(i), t->getSrcCol(i));
}
}
//
// Run the iterator backwards, verify that the same breaks are found.
//
- prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
+ prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
if (prevBP == bp) {
// Fail for lack of progress.
errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
- bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+ bp, t->getSrcLine(bp), t->getSrcCol(bp));
break;
}
- // Check that there were we didn't miss an expected break between the last one
+ // Check that we didn't miss an expected break between the last one
// and this one. (UVector returns zeros for index out of bounds.)
for (i=prevBP-1; i>bp; i--) {
- if (t->expectedBreaks->elementAti(i) != 0) {
- errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+ if (t->getExpectedBreak(i) != 0) {
+ errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
+ i, t->getSrcLine(i), t->getSrcCol(i));
}
}
// Check that the break we did find was expected
- if (t->expectedBreaks->elementAti(bp) == 0) {
+ if (t->getExpectedBreak(bp) == 0) {
- errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
+ errln("Reverse Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
- bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+ bp, t->getSrcLine(bp), t->getSrcCol(bp));
} else {
// The break was expected.
// Check that the {nnn} tag value is correct.
- int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
+ int32_t expectedTagVal = t->getExpectedBreak(bp);
if (expectedTagVal == -1) {
expectedTagVal = 0;
}
- int line = t->srcLine->elementAti(bp);
- int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
+ int line = t->getSrcLine(bp);
+ int32_t rs = t->bi->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
- bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
+ bp, line, t->getSrcCol(bp), rs, expectedTagVal);
}
}
// Verify that there were no missed breaks prior to the last one found
for (i=prevBP-1; i>=0; i--) {
- if (t->expectedBreaks->elementAti(i) != 0) {
+ if (t->getExpectedBreak(i) != 0) {
- errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
+ errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+ i, t->getSrcLine(i), t->getSrcCol(i));
}
}
// Check isBoundary()
- for (i=0; i<t->expectedBreaks->size(); i++) {
- UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
+ for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
+ UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
UBool boundaryFound = t->bi->isBoundary(i);
if (boundaryExpected != boundaryFound) {
errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
" Expected, Actual= %s, %s",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
+ i, t->getSrcLine(i), t->getSrcCol(i),
boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
}
}
// Check following()
- for (i=0; i<t->expectedBreaks->size(); i++) {
+ for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
int32_t actualBreak = t->bi->following(i);
int32_t expectedBreak = BreakIterator::DONE;
- for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
- if (t->expectedBreaks->elementAti(j) != 0) {
+ for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
+ if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
break;
}
if (expectedBreak != actualBreak) {
errln("following(%d) incorrect. File line,col= %4d,%4d\n"
" Expected, Actual= %d, %d",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
+ i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
}
}
// Check preceding()
- for (i=t->expectedBreaks->size(); i>=0; i--) {
+ for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
int32_t actualBreak = t->bi->preceding(i);
int32_t expectedBreak = BreakIterator::DONE;
- for (int32_t j=i-1; j >= 0; j--) {
- if (t->expectedBreaks->elementAti(j) != 0) {
+ // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
+ // preceding(trailing byte) will return the index of some preceding code point,
+ // not the lead byte of the current code point, even though that has a smaller index.
+ // Therefore, start looking at the expected break data not at i-1, but at
+ // the start of code point index - 1.
+ utext_setNativeIndex(t->textToBreak, i);
+ int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
+ for (; j >= 0; j--) {
+ if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
break;
}
if (expectedBreak != actualBreak) {
errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
" Expected, Actual= %d, %d",
- i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
+ i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
}
}
}
Locale locale("");
UnicodeString rules;
- TestParams tp;
- tp.bi = NULL;
- tp.expectedBreaks = new UVector32(status);
- tp.srcLine = new UVector32(status);
- tp.srcCol = new UVector32(status);
+ TestParams tp(status);
- RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
+ RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
if (U_FAILURE(status)) {
dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
}
charIdx += 6;
// RUN THE TEST!
- executeTest(&tp);
+ status = U_ZERO_ERROR;
+ tp.setUTF16(status);
+ executeTest(&tp, status);
+ TEST_ASSERT_SUCCESS(status);
+
+ // Run again, this time with UTF-8 text wrapped in a UText.
+ status = U_ZERO_ERROR;
+ tp.setUTF8(status);
+ TEST_ASSERT_SUCCESS(status);
+ executeTest(&tp, status);
break;
}
}
end_test:
- delete tp.bi;
- delete tp.expectedBreaks;
- delete tp.srcLine;
- delete tp.srcCol;
delete [] testFile;
#endif
}
}
+// Decide whether a test case from the Unicode test data files is one that is
+// known to fail and must be skipped, because ICU is not yet able to fully
+// implement the spec. See ticket #7270.
+// Only cases from LineBreakTest.txt are considered. For a listed case the
+// result is whatever logKnownIssue("7270") returns; otherwise FALSE.
+
+UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
+    static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
+        {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
+        {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
+        {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
+        {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
+        {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
+        {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
+    };
+    const UBool isLineBreakFile = (strcmp(fileName, "LineBreakTest.txt") == 0);
+    if (!isLineBreakFile) {
+        return FALSE;
+    }
+
+    // Each table row is a NUL-terminated UTF-16 string; compare it whole.
+    int caseIndex = 0;
+    while (caseIndex < UPRV_LENGTHOF(badTestCases)) {
+        const UnicodeString knownBad(badTestCases[caseIndex]);
+        if (testCase == knownBad) {
+            return logKnownIssue("7270");
+        }
+        ++caseIndex;
+    }
+    return FALSE;
+}
+
+
//--------------------------------------------------------------------------------------------
//
// Run tests from one of the boundary test data files distributed by the Unicode Consortium
//-------------------------------------------------------------------------------------------
void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
- // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
- UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
- UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
UErrorCode status = U_ZERO_ERROR;
//
else if (tokenMatcher.start(4, status) >= 0) {
// Scanned to end of a line, possibly skipping over a comment in the process.
// If the line from the file contained test data, run the test now.
- //
- if (testString.length() > 0) {
-// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
-// Rule 8
-// ZW SP* <break>
-// is not yet implemented.
-if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
- 5202 == lineNumber ||
- 5214 == lineNumber ||
- 5246 == lineNumber ||
- 5298 == lineNumber ||
- 5302 == lineNumber ))) {
+ if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
-}
}
// Clear out this test case.
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
c0 = c1 = c2 = 0;
+ (void)p0; // suppress set but not used warning.
+ (void)c0;
// Loop runs once per "significant" character position in the input text.
for (;;) {
UnicodeSet *fCRSet;
UnicodeSet *fLFSet;
UnicodeSet *fNewlineSet;
+ UnicodeSet *fRegionalIndicatorSet;
UnicodeSet *fKatakanaSet;
+ UnicodeSet *fHebrew_LetterSet;
UnicodeSet *fALetterSet;
// TODO(jungshik): Do we still need this change?
// UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
+ UnicodeSet *fSingle_QuoteSet;
+ UnicodeSet *fDouble_QuoteSet;
UnicodeSet *fMidNumLetSet;
UnicodeSet *fMidLetterSet;
UnicodeSet *fMidNumSet;
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
- UnicodeSet *fRegionalIndicatorSet;
UnicodeSet *fDictionaryCjkSet;
- RegexMatcher *fMatcher;
-
const UnicodeString *fText;
};
"]]",
status);
#endif
- fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
+ fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
+ fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
+ fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
+ fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
fALetterSet->removeAll(*fDictionaryCjkSet);
- fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
- fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
- fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
- fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
+ fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
+ fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
+ fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
+ fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
+ fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
// TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
// we should figure out why
- fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
- fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
- fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
- fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
- fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
+ fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
+ fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
+ fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
+ fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
fOtherSet->removeAll(*fLFSet);
fOtherSet->removeAll(*fNewlineSet);
fOtherSet->removeAll(*fKatakanaSet);
+ fOtherSet->removeAll(*fHebrew_LetterSet);
fOtherSet->removeAll(*fALetterSet);
+ fOtherSet->removeAll(*fSingle_QuoteSet);
+ fOtherSet->removeAll(*fDouble_QuoteSet);
fOtherSet->removeAll(*fMidLetterSet);
fOtherSet->removeAll(*fMidNumSet);
fOtherSet->removeAll(*fNumericSet);
fOtherSet->removeAll(*fDictionaryCjkSet);
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
- fSets->addElement(fCRSet, status);
- fSets->addElement(fLFSet, status);
- fSets->addElement(fNewlineSet, status);
- fSets->addElement(fALetterSet, status);
- //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
- fSets->addElement(fMidLetterSet, status);
- fSets->addElement(fMidNumLetSet, status);
- fSets->addElement(fMidNumSet, status);
- fSets->addElement(fNumericSet, status);
- fSets->addElement(fFormatSet, status);
- fSets->addElement(fExtendSet, status);
- fSets->addElement(fOtherSet, status);
- fSets->addElement(fExtendNumLetSet, status);
+ fSets->addElement(fCRSet, status);
+ fSets->addElement(fLFSet, status);
+ fSets->addElement(fNewlineSet, status);
fSets->addElement(fRegionalIndicatorSet, status);
+ fSets->addElement(fHebrew_LetterSet, status);
+ fSets->addElement(fALetterSet, status);
+ fSets->addElement(fSingle_QuoteSet, status);
+ fSets->addElement(fDouble_QuoteSet, status);
+ //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
+ fSets->addElement(fMidLetterSet, status);
+ fSets->addElement(fMidNumLetSet, status);
+ fSets->addElement(fMidNumSet, status);
+ fSets->addElement(fNumericSet, status);
+ fSets->addElement(fFormatSet, status);
+ fSets->addElement(fExtendSet, status);
+ fSets->addElement(fOtherSet, status);
+ fSets->addElement(fExtendNumLetSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
c0 = c1 = c2 = 0;
+ (void)p0; // Suppress set but not used warning.
// Loop runs once per "significant" character position in the input text.
for (;;) {
break;
};
- // Rule (5). ALetter x ALetter
- if (fALetterSet->contains(c1) &&
- fALetterSet->contains(c2)) {
+ // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
+ if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
+ (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
continue;
}
- // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
+ // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
//
- if ( fALetterSet->contains(c1) &&
- (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
- fALetterSet->contains(c3)) {
+ if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
+ (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
+ (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
continue;
}
+ // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
+ if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
+ (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
+ (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
+ continue;
+ }
+
+ // Rule (7a) Hebrew_Letter x Single_Quote
+ if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
+ continue;
+ }
+
+ // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
+ if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
+ continue;
+ }
- // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
- if (fALetterSet->contains(c0) &&
- (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
- fALetterSet->contains(c2)) {
+ // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
+ if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
continue;
}
continue;
}
- // Rule (9) ALetter x Numeric
- if (fALetterSet->contains(c1) &&
+ // Rule (9) (ALetter | Hebrew_Letter) x Numeric
+ if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
fNumericSet->contains(c2)) {
continue;
}
- // Rule (10) Numeric x ALetter
+ // Rule (10) Numeric x (ALetter | Hebrew_Letter)
if (fNumericSet->contains(c1) &&
- fALetterSet->contains(c2)) {
+ (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
continue;
}
- // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
+ // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
if (fNumericSet->contains(c0) &&
- (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
+ (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
fNumericSet->contains(c2)) {
continue;
}
- // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
+ // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
if (fNumericSet->contains(c1) &&
- (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
+ (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
fNumericSet->contains(c3)) {
continue;
}
continue;
}
- // Rule 13a
- if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
+ // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
+ if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
fExtendNumLetSet->contains(c2)) {
continue;
}
- // Rule 13b
+ // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
if (fExtendNumLetSet->contains(c1) &&
- (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
- fKatakanaSet->contains(c2))) {
- continue;
+ (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
+ fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
+ continue;
}
// Rule 13c
delete fLFSet;
delete fNewlineSet;
delete fKatakanaSet;
+ delete fHebrew_LetterSet;
delete fALetterSet;
+ delete fSingle_QuoteSet;
+ delete fDouble_QuoteSet;
delete fMidNumLetSet;
delete fMidLetterSet;
delete fMidNumSet;
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
c0 = c1 = c2 = 0;
+ (void)p0; // Suppress set but not used warning.
// Loop runs once per "significant" character position in the input text.
for (;;) {
UnicodeSet *fSA;
UnicodeSet *fXX;
- BreakIterator *fCharBI;
-
+ BreakIterator *fCharBI;
const UnicodeString *fText;
- int32_t *fOrigPositions;
-
RegexMatcher *fNumberMatcher;
- RegexMatcher *fLB11Matcher;
};
continue;
}
- // LB 21b - Added for Apple 13927604
+ // LB 21b
+ // SY x HL
if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
continue;
}
expectedBreaks[breakPos] = 1;
U_ASSERT(expectedCount<testText.length());
expected[expectedCount ++] = breakPos;
+ (void)expected; // Set but not used warning.
+ // TODO (andy): check it out.
}
// Find the break positions using forward iteration
LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
BreakIterator::createWordInstance(Locale::getRoot(), status)));
TEST_ASSERT_SUCCESS(status);
+ LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
+ BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
+ TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
}
+ int32_t offset, rstatus, iterationCount;
+
brkiter->setText(text);
- int32_t offset, rstatus;
brkiter->last();
- int32_t iterationCount = 0;
+ iterationCount = 0;
while ( (offset = brkiter->previous()) != UBRK_DONE ) {
iterationCount++;
rstatus = brkiter->getRuleStatus();
- // printf(" %d(%d)", offset, rstatus);
+ (void)rstatus; // Suppress set but not used warning.
+ if (iterationCount >= 10) {
+ break;
+ }
+ }
+ TEST_ASSERT(iterationCount == 6);
+
+ brkiterPOSIX->setText(text);
+ brkiterPOSIX->last();
+ iterationCount = 0;
+ while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
+ iterationCount++;
+ rstatus = brkiterPOSIX->getRuleStatus();
+ (void)rstatus; // Suppress set but not used warning.
if (iterationCount >= 10) {
break;
}