]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/rbbi.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
index 10216c227daf060d1b317d009843760b22e92b0d..f76d02ae294c2aaf3593639661b2196e5b9fb1df 100644 (file)
@@ -1,7 +1,7 @@
 /*
 ***************************************************************************
-*   Copyright (C) 1999-2006 International Business Machines Corporation   *
-*   and others. All rights reserved.                                      *
+*   Copyright (C) 1999-2012 International Business Machines Corporation
+*   and others. All rights reserved.
 ***************************************************************************
 */
 //
@@ -10,6 +10,8 @@
 //                   class RuleBasedBreakIterator
 //
 
+#include <typeinfo>  // for 'typeid' to work
+
 #include "unicode/utypes.h"
 
 #if !UCONFIG_NO_BREAK_ITERATION
@@ -23,7 +25,7 @@
 #include "rbbirb.h"
 #include "cmemory.h"
 #include "cstring.h"
-#include "mutex.h"
+#include "umutex.h"
 #include "ucln_cmn.h"
 #include "brkeng.h"
 
@@ -41,9 +43,11 @@ static UBool fTrace = FALSE;
 
 U_NAMESPACE_BEGIN
 
+// The state number of the starting state
+#define START_STATE 1
 
-static const int16_t START_STATE = 1;     // The state number of the starting state
-static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
+// The state-transition value indicating "stop"
+#define STOP_STATE  0
 
 
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
@@ -68,6 +72,50 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
     }
 }
 
+/**
+ * Same as above but does not adopt memory
+ */
+RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
+{
+    init();
+    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
+    if (U_FAILURE(status)) {return;}
+    if(fData == 0) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+}
+
+
+//
+//  Construct from precompiled binary rules (tables).  This constructor is public API,
+//  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
+//
+RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
+                       uint32_t       ruleLength,
+                       UErrorCode     &status) {
+    init();
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
+    if (data->fLength > ruleLength) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 
+    if (U_FAILURE(status)) {return;}
+    if(fData == 0) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+}    
+
+
 //-------------------------------------------------------------------------------
 //
 //   Constructor   from a UDataMemory handle to precompiled break rules
@@ -99,7 +147,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
     init();
     if (U_FAILURE(status)) {return;}
     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
-        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
+        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
     //        creates and returns a complete RBBI.  From here, in a constructor, we
     //        can't just return the object created by the builder factory, hence
@@ -231,7 +279,10 @@ void RuleBasedBreakIterator::init() {
     fLastRuleStatusIndex  = 0;
     fLastStatusIndexValid = TRUE;
     fDictionaryCharCount  = 0;
-    fBreakType            = -1;
+    fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
+                                        //   dictionary behavior for Break Iterators that are
+                                        //   built from rules.  Even better would be the ability to
+                                        //   declare the type in the rules.
 
     fCachedBreakPositions    = NULL;
     fLanguageBreakEngines    = NULL;
@@ -271,7 +322,7 @@ RuleBasedBreakIterator::clone(void) const {
  */
 UBool
 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
-    if (that.getDynamicClassID() != getDynamicClassID()) {
+    if (typeid(*this) != typeid(that)) {
         return FALSE;
     }
 
@@ -321,8 +372,12 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
     //   we can come to signaling a failure.
     //   (GetText() is obsolete, this failure is sort of OK)
     if (fDCharIter == NULL) {
-        static UChar c = 0;
+        static const UChar c = 0;
         fDCharIter = new UCharCharacterIterator(&c, 0);
+        if (fDCharIter == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
     }
 
     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
@@ -431,6 +486,37 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 }
 
 
+/**
+ *  Provide a new UText for the input text.  Must reference text with contents identical
+ *  to the original.
+ *  Intended for use with text data originating in Java (garbage collected) environments
+ *  where the data may be moved in memory at arbitrary times.
+ */
+RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (input == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    int64_t pos = utext_getNativeIndex(fText);
+    //  Shallow read-only clone of the new UText into the existing input UText
+    fText = utext_clone(fText, input, FALSE, TRUE, &status);
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    utext_setNativeIndex(fText, pos);
+    if (utext_getNativeIndex(fText) != pos) {
+        // Sanity check.  The new input utext is supposed to have the exact same
+        // contents as the old.  If we can't set to the same position, it doesn't.
+        // The contents underlying the old utext might be invalid at this point,
+        // so it's not safe to check directly.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return *this;
+}
+
 
 /**
  * Sets the current iteration position to the beginning of the text.
@@ -566,7 +652,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
 
     int32_t start = current();
 
-    UTEXT_PREVIOUS32(fText);
+    (void)UTEXT_PREVIOUS32(fText);
     int32_t lastResult    = handlePrevious(fData->fReverseTable);
     if (lastResult == UBRK_DONE) {
         lastResult = 0;
@@ -662,7 +748,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
         // move forward one codepoint to prepare for moving back to a
         // safe point.
         // this handles offset being between a supplementary character
-        UTEXT_NEXT32(fText);
+        (void)UTEXT_NEXT32(fText);
         // handlePrevious will move most of the time to < 1 boundary away
         handlePrevious(fData->fSafeRevTable);
         int32_t result = next();
@@ -674,7 +760,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
     if (fData->fSafeFwdTable != NULL) {
         // backup plan if forward safe table is not available
         utext_setNativeIndex(fText, offset);
-        UTEXT_PREVIOUS32(fText);
+        (void)UTEXT_PREVIOUS32(fText);
         // handle next will give result >= offset
         handleNext(fData->fSafeFwdTable);
         // previous will give result 0 or 1 boundary away from offset,
@@ -705,7 +791,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
 
     utext_setNativeIndex(fText, offset);
     if (offset==0 || 
-        offset==1  && utext_getNativeIndex(fText)==0) {
+        (offset==1  && utext_getNativeIndex(fText)==0)) {
         return next();
     }
     result = previous();
@@ -774,7 +860,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
             //   indices to the containing code point.
             // For breakitereator::preceding only, these non-code-point indices need to be moved
             //   up to refer to the following codepoint.
-            UTEXT_NEXT32(fText);
+            (void)UTEXT_NEXT32(fText);
             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
         }
 
@@ -783,7 +869,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
         //        (Change would interact with safe rules.)
         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
         //        affects only preceding(), seems cleaner, but is slightly different.
-        UTEXT_PREVIOUS32(fText);
+        (void)UTEXT_PREVIOUS32(fText);
         handleNext(fData->fSafeFwdTable);
         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
         while (result >= offset) {
@@ -798,7 +884,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
         //            if they use safe tables at all.  We have certainly never described
         //            to anyone how to work with just one safe table.
         utext_setNativeIndex(fText, offset);
-        UTEXT_NEXT32(fText);
+        (void)UTEXT_NEXT32(fText);
         
         // handle previous will give result <= offset
         handlePrevious(fData->fSafeRevTable);
@@ -902,7 +988,7 @@ enum RBBIRunMode {
 //-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
     int32_t             state;
-    int16_t             category        = 0;
+    uint16_t            category        = 0;
     RBBIRunMode         mode;
     
     RBBIStateTableRow  *row;
@@ -997,9 +1083,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
             }
         }
 
-        #ifdef RBBI_DEBUG
+       #ifdef RBBI_DEBUG
             if (fTrace) {
-                RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
+                RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
                 if (0x20<=c && c<0x7f) {
                     RBBIDebugPrintf("\"%c\"  ", c);
                 } else {
@@ -1011,7 +1097,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
 
         // State Transition - move machine to its next state
         //
-        state = row->fNextState[category];
+
+        // Note: fNextState is defined as uint16_t[2], but we are casting
+        // a generated RBBI table to RBBIStateTableRow and some tables
+        // actually have more than 2 categories.
+        U_ASSERT(category<fData->fHeader->fCatCount);
+        state = row->fNextState[category];  /*Not accessing beyond memory*/
         row = (RBBIStateTableRow *)
             // (statetable->fTableData + (statetable->fRowLen * state));
             (tableData + tableRowLen * state);
@@ -1034,7 +1125,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
                 lookaheadStatus      = 0;
                 // TODO:  make a standalone hard break in a rule work.
                 if (lookAheadHardBreak) {
-                    utext_setNativeIndex(fText, result);
+                    UTEXT_SETNATIVEINDEX(fText, result);
                     return result;
                 }
                 // Look-ahead completed, but other rules may match further.  Continue on
@@ -1085,13 +1176,13 @@ continueOn:
     //   (This really indicates a defect in the break rules.  They should always match
     //    at least one character.)
     if (result == initialPosition) {
-        utext_setNativeIndex(fText, initialPosition);
+        UTEXT_SETNATIVEINDEX(fText, initialPosition);
         UTEXT_NEXT32(fText);
         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
     }
 
     // Leave the iterator at our result position.
-    utext_setNativeIndex(fText, result);
+    UTEXT_SETNATIVEINDEX(fText, result);
     #ifdef RBBI_DEBUG
         if (fTrace) {
             RBBIDebugPrintf("result = %d\n\n", result);
@@ -1114,7 +1205,7 @@ continueOn:
 //-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
     int32_t             state;
-    int16_t             category        = 0;
+    uint16_t            category        = 0;
     RBBIRunMode         mode;
     RBBIStateTableRow  *row;
     UChar32             c;
@@ -1164,12 +1255,10 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
     for (;;) {
         if (c == U_SENTINEL) {
             // Reached end of input string.
-            if (mode == RBBI_END || 
-                *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
+            if (mode == RBBI_END) {
                 // We have already run the loop one last time with the 
                 //   character set to the psueudo {eof} value.  Now it is time
                 //   to unconditionally bail out.
-                //  (Or we have an old format binary rule file that does not support {eof}.)
                 if (lookaheadResult < result) {
                     // We ran off the end of the string with a pending look-ahead match.
                     // Treat this as if the look-ahead condition had been met, and return
@@ -1179,8 +1268,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
                 } else if (result == initialPosition) {
                     // Ran off start, no match found.
                     // move one index one (towards the start, since we are doing a previous())
-                    utext_setNativeIndex(fText, initialPosition);
-                    UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
+                    UTEXT_SETNATIVEINDEX(fText, initialPosition);
+                    (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
                 }
                 break;
             }
@@ -1228,7 +1317,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
 
         // State Transition - move machine to its next state
         //
-        state = row->fNextState[category];
+
+        // Note: fNextState is defined as uint16_t[2], but we are casting
+        // a generated RBBI table to RBBIStateTableRow and some tables
+        // actually have more than 2 categories.
+        U_ASSERT(category<fData->fHeader->fCatCount);
+        state = row->fNextState[category];  /*Not accessing beyond memory*/
         row = (RBBIStateTableRow *)
             (statetable->fTableData + (statetable->fRowLen * state));
 
@@ -1245,7 +1339,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
                 lookaheadStatus      = 0;
                 // TODO:  make a standalone hard break in a rule work.
                 if (lookAheadHardBreak) {
-                    utext_setNativeIndex(fText, result);
+                    UTEXT_SETNATIVEINDEX(fText, result);
                     return result;
                 }
                 // Look-ahead completed, but other rules may match further.  Continue on
@@ -1293,13 +1387,13 @@ continueOn:
     //   (This really indicates a defect in the break rules.  They should always match
     //    at least one character.)
     if (result == initialPosition) {
-        utext_setNativeIndex(fText, initialPosition);
+        UTEXT_SETNATIVEINDEX(fText, initialPosition);
         UTEXT_PREVIOUS32(fText);
         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
     }
 
     // Leave the iterator at our result position.
-    utext_setNativeIndex(fText, result);
+    UTEXT_SETNATIVEINDEX(fText, result);
     #ifdef RBBI_DEBUG
         if (fTrace) {
             RBBIDebugPrintf("result = %d\n\n", result);
@@ -1481,19 +1575,8 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
 
     //
     //  Clone the source BI into the caller-supplied buffer.
-    //    TODO:  using an overloaded operator new to directly initialize the
-    //           copy in the user's buffer would be better, but it doesn't seem
-    //           to get along with namespaces.  Investigate why.
-    //
-    //           The memcpy is only safe with an empty (default constructed)
-    //           break iterator.  Use on others can screw up reference counts
-    //           to data.  memcpy-ing objects is not really a good idea...
     //
-    RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
-    RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
-    uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
-    clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
-    *clone = *this;               // clone = the real BI we want.
+    RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this);
     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
 
     return clone;
@@ -1539,6 +1622,30 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
         return (reverse ? startPos : endPos);
     }
     
+    // Bug 5532.  The dictionary code will crash if the input text is UTF-8
+    //      because native indexes are different from UTF-16 indexes.
+    //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
+    //      It wont give the right breaks, but it's better than a crash.
+    //
+    //      Check the type of the UText by checking its pFuncs field, which
+    //      is UText's function dispatch table.  It will be the same for all
+    //      UTF-8 UTexts and different for any other UText type.
+    //
+    //      We have no other type of UText available with non-UTF-16 native indexing.
+    //      This whole check will go away once the dictionary code is fixed.
+    static const void *utext_utf8Funcs;
+    if (utext_utf8Funcs == NULL) {
+        // Cache the UTF-8 UText function pointer value.
+        UErrorCode status = U_ZERO_ERROR;
+        UText tempUText = UTEXT_INITIALIZER; 
+        utext_openUTF8(&tempUText, NULL, 0, &status);
+        utext_utf8Funcs = tempUText.pFuncs;
+        utext_close(&tempUText);
+    }
+    if (fText->pFuncs == utext_utf8Funcs) {
+        return (reverse ? startPos : endPos);
+    }
+
     // Starting from the starting point, scan towards the proposed result,
     // looking for the first dictionary character (which may be the one
     // we're on, if we're starting in the middle of a range).
@@ -1679,12 +1786,12 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
     return (reverse ? startPos : endPos);
 }
 
-static UStack *gLanguageBreakFactories = NULL;
-
 U_NAMESPACE_END
 
 // defined in ucln_cmn.h
 
+static icu::UStack *gLanguageBreakFactories = NULL;
+
 /**
  * Release all static memory held by breakiterator.  
  */
@@ -1700,7 +1807,7 @@ U_CDECL_END
 
 U_CDECL_BEGIN
 static void U_CALLCONV _deleteFactory(void *obj) {
-    delete (LanguageBreakFactory *) obj;
+    delete (icu::LanguageBreakFactory *) obj;
 }
 U_CDECL_END
 U_NAMESPACE_BEGIN
@@ -1710,13 +1817,11 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
 {
     UBool       needsInit;
     UErrorCode  status = U_ZERO_ERROR;
-    umtx_lock(NULL);
-    needsInit = (UBool)(gLanguageBreakFactories == NULL);
-    umtx_unlock(NULL);
+    UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
     
     if (needsInit) {
         UStack  *factories = new UStack(_deleteFactory, NULL, status);
-        if (U_SUCCESS(status)) {
+        if (factories != NULL && U_SUCCESS(status)) {
             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
             factories->push(builtIn, status);
 #ifdef U_LOCAL_SERVICE_HOOK
@@ -1766,7 +1871,7 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
     
     if (fLanguageBreakEngines == NULL) {
         fLanguageBreakEngines = new UStack(status);
-        if (U_FAILURE(status)) {
+        if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
             delete fLanguageBreakEngines;
             fLanguageBreakEngines = 0;
             return NULL;