X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..4388f060552cc537e71e957d32f35e9d75a61233:/icuSources/common/rbbi.cpp diff --git a/icuSources/common/rbbi.cpp b/icuSources/common/rbbi.cpp index 10216c22..f76d02ae 100644 --- a/icuSources/common/rbbi.cpp +++ b/icuSources/common/rbbi.cpp @@ -1,7 +1,7 @@ /* *************************************************************************** -* Copyright (C) 1999-2006 International Business Machines Corporation * -* and others. All rights reserved. * +* Copyright (C) 1999-2012 International Business Machines Corporation +* and others. All rights reserved. *************************************************************************** */ // @@ -10,6 +10,8 @@ // class RuleBasedBreakIterator // +#include <typeinfo> // for 'typeid' to work + #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION @@ -23,7 +25,7 @@ #include "rbbirb.h" #include "cmemory.h" #include "cstring.h" -#include "mutex.h" +#include "umutex.h" #include "ucln_cmn.h" #include "brkeng.h" @@ -41,9 +43,11 @@ static UBool fTrace = FALSE; U_NAMESPACE_BEGIN +// The state number of the starting state +#define START_STATE 1 -static const int16_t START_STATE = 1; // The state number of the starting state -static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop" +// The state-transition value indicating "stop" +#define STOP_STATE 0 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) @@ -68,6 +72,50 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +/** + * Same as above but does not adopt memory + */ +RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) +{ + init(); + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor + if (U_FAILURE(status)) {return;} + if(fData == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + + +// +// Construct 
from precompiled binary rules (tables). This constructor is public API, +// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). +// +RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, + uint32_t ruleLength, + UErrorCode &status) { + init(); + if (U_FAILURE(status)) { + return; + } + if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; + if (data->fLength > ruleLength) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); + if (U_FAILURE(status)) {return;} + if(fData == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + + //------------------------------------------------------------------------------- // // Constructor from a UDataMemory handle to precompiled break rules @@ -99,7 +147,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, init(); if (U_FAILURE(status)) {return;} RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) - RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status); + RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that // creates and returns a complete RBBI. From here, in a constructor, we // can't just return the object created by the builder factory, hence @@ -231,7 +279,10 @@ void RuleBasedBreakIterator::init() { fLastRuleStatusIndex = 0; fLastStatusIndexValid = TRUE; fDictionaryCharCount = 0; - fBreakType = -1; + fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable + // dictionary behavior for Break Iterators that are + // built from rules. Even better would be the ability to + // declare the type in the rules. 
fCachedBreakPositions = NULL; fLanguageBreakEngines = NULL; @@ -271,7 +322,7 @@ RuleBasedBreakIterator::clone(void) const { */ UBool RuleBasedBreakIterator::operator==(const BreakIterator& that) const { - if (that.getDynamicClassID() != getDynamicClassID()) { + if (typeid(*this) != typeid(that)) { return FALSE; } @@ -321,8 +372,12 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { // we can come to signaling a failure. // (GetText() is obsolete, this failure is sort of OK) if (fDCharIter == NULL) { - static UChar c = 0; + static const UChar c = 0; fDCharIter = new UCharCharacterIterator(&c, 0); + if (fDCharIter == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } } if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { @@ -431,6 +486,37 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) { } +/** + * Provide a new UText for the input text. Must reference text with contents identical + * to the original. + * Intended for use with text data originating in Java (garbage collected) environments + * where the data may be moved in memory at arbitrary times. + */ +RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (input == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + int64_t pos = utext_getNativeIndex(fText); + // Shallow read-only clone of the new UText into the existing input UText + fText = utext_clone(fText, input, FALSE, TRUE, &status); + if (U_FAILURE(status)) { + return *this; + } + utext_setNativeIndex(fText, pos); + if (utext_getNativeIndex(fText) != pos) { + // Sanity check. The new input utext is supposed to have the exact same + // contents as the old. If we can't set to the same position, it doesn't. + // The contents underlying the old utext might be invalid at this point, + // so it's not safe to check directly. 
+ status = U_ILLEGAL_ARGUMENT_ERROR; + } + return *this; +} + /** * Sets the current iteration position to the beginning of the text. @@ -566,7 +652,7 @@ int32_t RuleBasedBreakIterator::previous(void) { int32_t start = current(); - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); int32_t lastResult = handlePrevious(fData->fReverseTable); if (lastResult == UBRK_DONE) { lastResult = 0; @@ -662,7 +748,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { // move forward one codepoint to prepare for moving back to a // safe point. // this handles offset being between a supplementary character - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fData->fSafeRevTable); int32_t result = next(); @@ -674,7 +760,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { if (fData->fSafeFwdTable != NULL) { // backup plan if forward safe table is not available utext_setNativeIndex(fText, offset); - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); // handle next will give result >= offset handleNext(fData->fSafeFwdTable); // previous will give result 0 or 1 boundary away from offset, @@ -705,7 +791,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { utext_setNativeIndex(fText, offset); if (offset==0 || - offset==1 && utext_getNativeIndex(fText)==0) { + (offset==1 && utext_getNativeIndex(fText)==0)) { return next(); } result = previous(); @@ -774,7 +860,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // indices to the containing code point. // For breakitereator::preceding only, these non-code-point indices need to be moved // up to refer to the following codepoint. - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); } @@ -783,7 +869,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // (Change would interact with safe rules.) 
// TODO: change RBBI behavior for off-boundary indices to match that of UText? // affects only preceding(), seems cleaner, but is slightly different. - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); handleNext(fData->fSafeFwdTable); int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); while (result >= offset) { @@ -798,7 +884,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // if they use safe tables at all. We have certainly never described // to anyone how to work with just one safe table. utext_setNativeIndex(fText, offset); - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); // handle previous will give result <= offset handlePrevious(fData->fSafeRevTable); @@ -902,7 +988,7 @@ enum RBBIRunMode { //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; @@ -997,9 +1083,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } } - #ifdef RBBI_DEBUG + #ifdef RBBI_DEBUG if (fTrace) { - RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); + RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); } else { @@ -1011,7 +1097,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // Note: fNextState is defined as uint16_t[2], but we are casting + // a generated RBBI table to RBBIStateTableRow and some tables + // actually have more than 2 categories. 
+ U_ASSERT(category<fData->fHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) // (statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -1034,7 +1125,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { lookaheadStatus = 0; // TODO: make a standalone hard break in a rule work. if (lookAheadHardBreak) { - utext_setNativeIndex(fText, result); + UTEXT_SETNATIVEINDEX(fText, result); return result; } // Look-ahead completed, but other rules may match further. Continue on @@ -1085,13 +1176,13 @@ continueOn: // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { - utext_setNativeIndex(fText, initialPosition); + UTEXT_SETNATIVEINDEX(fText, initialPosition); UTEXT_NEXT32(fText); result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } // Leave the iterator at our result position. - utext_setNativeIndex(fText, result); + UTEXT_SETNATIVEINDEX(fText, result); #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf("result = %d\n\n", result); @@ -1114,7 +1205,7 @@ continueOn: //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; @@ -1164,12 +1255,10 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) for (;;) { if (c == U_SENTINEL) { // Reached end of input string. - if (mode == RBBI_END || - *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { + if (mode == RBBI_END) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - // (Or we have an old format binary rule file that does not support {eof}.) 
if (lookaheadResult < result) { // We ran off the end of the string with a pending look-ahead match. // Treat this as if the look-ahead condition had been met, and return @@ -1179,8 +1268,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) } else if (result == initialPosition) { // Ran off start, no match found. // move one index one (towards the start, since we are doing a previous()) - utext_setNativeIndex(fText, initialPosition); - UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. + UTEXT_SETNATIVEINDEX(fText, initialPosition); + (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. } break; } @@ -1228,7 +1317,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // Note: fNextState is defined as uint16_t[2], but we are casting + // a generated RBBI table to RBBIStateTableRow and some tables + // actually have more than 2 categories. + U_ASSERT(category<fData->fHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); @@ -1245,7 +1339,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) lookaheadStatus = 0; // TODO: make a standalone hard break in a rule work. if (lookAheadHardBreak) { - utext_setNativeIndex(fText, result); + UTEXT_SETNATIVEINDEX(fText, result); return result; } // Look-ahead completed, but other rules may match further. Continue on @@ -1293,13 +1387,13 @@ continueOn: // (This really indicates a defect in the break rules. They should always match // at least one character.) 
if (result == initialPosition) { - utext_setNativeIndex(fText, initialPosition); + UTEXT_SETNATIVEINDEX(fText, initialPosition); UTEXT_PREVIOUS32(fText); result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } // Leave the iterator at our result position. - utext_setNativeIndex(fText, result); + UTEXT_SETNATIVEINDEX(fText, result); #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf("result = %d\n\n", result); @@ -1481,19 +1575,8 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, // // Clone the source BI into the caller-supplied buffer. - // TODO: using an overloaded operator new to directly initialize the - // copy in the user's buffer would be better, but it doesn't seem - // to get along with namespaces. Investigate why. - // - // The memcpy is only safe with an empty (default constructed) - // break iterator. Use on others can screw up reference counts - // to data. memcpy-ing objects is not really a good idea... // - RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy - RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; - uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part - clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) - *clone = *this; // clone = the real BI we want. + RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this); clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) return clone; @@ -1539,6 +1622,30 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, return (reverse ? startPos : endPos); } + // Bug 5532. The dictionary code will crash if the input text is UTF-8 + // because native indexes are different from UTF-16 indexes. + // Temporary hack: skip dictionary lookup for UTF-8 encoded text. + // It wont give the right breaks, but it's better than a crash. 
+ // + // Check the type of the UText by checking its pFuncs field, which + // is UText's function dispatch table. It will be the same for all + // UTF-8 UTexts and different for any other UText type. + // + // We have no other type of UText available with non-UTF-16 native indexing. + // This whole check will go away once the dictionary code is fixed. + static const void *utext_utf8Funcs; + if (utext_utf8Funcs == NULL) { + // Cache the UTF-8 UText function pointer value. + UErrorCode status = U_ZERO_ERROR; + UText tempUText = UTEXT_INITIALIZER; + utext_openUTF8(&tempUText, NULL, 0, &status); + utext_utf8Funcs = tempUText.pFuncs; + utext_close(&tempUText); + } + if (fText->pFuncs == utext_utf8Funcs) { + return (reverse ? startPos : endPos); + } + // Starting from the starting point, scan towards the proposed result, // looking for the first dictionary character (which may be the one // we're on, if we're starting in the middle of a range). @@ -1679,12 +1786,12 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, return (reverse ? startPos : endPos); } -static UStack *gLanguageBreakFactories = NULL; - U_NAMESPACE_END // defined in ucln_cmn.h +static icu::UStack *gLanguageBreakFactories = NULL; + /** * Release all static memory held by breakiterator. 
*/ @@ -1700,7 +1807,7 @@ U_CDECL_END U_CDECL_BEGIN static void U_CALLCONV _deleteFactory(void *obj) { - delete (LanguageBreakFactory *) obj; + delete (icu::LanguageBreakFactory *) obj; } U_CDECL_END U_NAMESPACE_BEGIN @@ -1710,13 +1817,11 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) { UBool needsInit; UErrorCode status = U_ZERO_ERROR; - umtx_lock(NULL); - needsInit = (UBool)(gLanguageBreakFactories == NULL); - umtx_unlock(NULL); + UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); if (needsInit) { UStack *factories = new UStack(_deleteFactory, NULL, status); - if (U_SUCCESS(status)) { + if (factories != NULL && U_SUCCESS(status)) { ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); factories->push(builtIn, status); #ifdef U_LOCAL_SERVICE_HOOK @@ -1766,7 +1871,7 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { if (fLanguageBreakEngines == NULL) { fLanguageBreakEngines = new UStack(status); - if (U_FAILURE(status)) { + if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { delete fLanguageBreakEngines; fLanguageBreakEngines = 0; return NULL;