]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/rbbi.cpp
ICU-57132.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
index b06a01c09360d08ed13996681209a62207564407..43694734b2ba2a10d9d15ca994e9b1b917bf2204 100644 (file)
@@ -1,7 +1,7 @@
 /*
 ***************************************************************************
-*   Copyright (C) 1999-2008 International Business Machines Corporation   *
-*   and others. All rights reserved.                                      *
+*   Copyright (C) 1999-2016 International Business Machines Corporation
+*   and others. All rights reserved.
 ***************************************************************************
 */
 //
@@ -10,6 +10,8 @@
 //                   class RuleBasedBreakIterator
 //
 
+#include "utypeinfo.h"  // for 'typeid' to work
+
 #include "unicode/utypes.h"
 
 #if !UCONFIG_NO_BREAK_ITERATION
@@ -84,6 +86,36 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum
     }
 }
 
+
+//
+//  Construct from precompiled binary rules (tables).  This constructor is public API,
+//  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
+//  Sets U_ILLEGAL_ARGUMENT_ERROR for NULL or too-short rules; no-op past init() if status already failed.
+RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
+                       uint32_t       ruleLength,
+                       UErrorCode     &status) {
+    init();                          // runs before the status check, so members are always initialized
+    if (U_FAILURE(status)) {         // incoming error: leave the iterator without rule data
+        return;
+    }
+    if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {   // need a complete header to read fLength
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
+    if (data->fLength > ruleLength) {   // header claims more data than the caller supplied
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);   // wrapped with kDontAdopt
+    if (U_FAILURE(status)) {return;}   // the wrapper constructor reported an error through status
+    if(fData == 0) {                   // status still OK but no object: allocation failed
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+}
+
+
 //-------------------------------------------------------------------------------
 //
 //   Constructor   from a UDataMemory handle to precompiled break rules
@@ -195,6 +227,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
     if (this == &that) {
         return *this;
     }
+    fKeepAll = that.fKeepAll;
     reset();    // Delete break cache information
     fBreakType = that.fBreakType;
     if (fLanguageBreakEngines != NULL) {
@@ -238,7 +271,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 //-----------------------------------------------------------------------------
 void RuleBasedBreakIterator::init() {
     UErrorCode  status    = U_ZERO_ERROR;
-    fBufferClone          = FALSE;
     fText                 = utext_openUChars(NULL, NULL, 0, &status);
     fCharIter             = NULL;
     fSCharIter            = NULL;
@@ -247,7 +279,10 @@ void RuleBasedBreakIterator::init() {
     fLastRuleStatusIndex  = 0;
     fLastStatusIndexValid = TRUE;
     fDictionaryCharCount  = 0;
-    fBreakType            = -1;
+    fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
+                                        //   dictionary behavior for Break Iterators that are
+                                        //   built from rules.  Even better would be the ability to
+                                        //   declare the type in the rules.
 
     fCachedBreakPositions    = NULL;
     fLanguageBreakEngines    = NULL;
@@ -287,11 +322,14 @@ RuleBasedBreakIterator::clone(void) const {
  */
 UBool
 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
-    if (that.getDynamicClassID() != getDynamicClassID()) {
+    if (typeid(*this) != typeid(that)) {
         return FALSE;
     }
 
     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
+    if (that2.fKeepAll != fKeepAll) {
+        return FALSE;
+    }
 
     if (!utext_equals(fText, that2.fText)) {
         // The two break iterators are operating on different text,
@@ -451,10 +489,41 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 }
 
 
+/**
+ *  Provide a new UText for the input text.  Must reference text with contents identical
+ *  to the original.  Intended for use with text data originating in Java (garbage collected)
+ *  environments where the data may be moved in memory at arbitrary times.
+ *  Always returns *this.  Sets status to U_ILLEGAL_ARGUMENT_ERROR when input is NULL or when
+ *  the previous iteration position cannot be restored in the new text.
+ */
+RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (input == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    int64_t pos = utext_getNativeIndex(fText);   // remember the current position before swapping text
+    //  Shallow read-only clone of the new UText into the existing input UText
+    fText = utext_clone(fText, input, FALSE, TRUE, &status);
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    utext_setNativeIndex(fText, pos);            // restore the saved position in the new text
+    if (utext_getNativeIndex(fText) != pos) {
+        // Sanity check.  The new input utext is supposed to have the exact same
+        // contents as the old.  If we can't set to the same position, it doesn't.
+        // The contents underlying the old utext might be invalid at this point,
+        // so it's not safe to check directly.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return *this;
+}
+
 
 /**
- * Sets the current iteration position to the beginning of the text.
- * @return The offset of the beginning of the text.
+ * Sets the current iteration position to the beginning of the text, position zero.
+ * @return The new iterator position, which is zero.
  */
 int32_t RuleBasedBreakIterator::first(void) {
     reset();
@@ -527,7 +596,20 @@ int32_t RuleBasedBreakIterator::next(void) {
     }
 
     int32_t startPos = current();
+    fDictionaryCharCount = 0;
     int32_t result = handleNext(fData->fForwardTable);
+    while (fKeepAll) {
+        UChar32 prevChr = utext_char32At(fText, result-1);
+        UChar32 currChr = utext_char32At(fText, result);
+        if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
+            break;
+        }
+        int32_t nextResult = handleNext(fData->fForwardTable);
+        if (nextResult <= result) {
+            break;
+        }
+        result = nextResult;
+    }
     if (fDictionaryCharCount > 0) {
         result = checkDictionary(startPos, result, FALSE);
     }
@@ -570,6 +652,18 @@ int32_t RuleBasedBreakIterator::previous(void) {
 
     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
         result = handlePrevious(fData->fReverseTable);
+        while (fKeepAll) {
+            UChar32 prevChr = utext_char32At(fText, result-1);
+            UChar32 currChr = utext_char32At(fText, result);
+            if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
+                break;
+            }
+            int32_t prevResult = handlePrevious(fData->fReverseTable);
+            if (prevResult >= result) {
+                break;
+            }
+            result = prevResult;
+        }
         if (fDictionaryCharCount > 0) {
             result = checkDictionary(result, startPos, TRUE);
         }
@@ -581,12 +675,11 @@ int32_t RuleBasedBreakIterator::previous(void) {
     // break position before the current position (we back our internal
     // iterator up one step to prevent handlePrevious() from returning
     // the current position), but not necessarily the last one before
-
     // where we started
 
     int32_t start = current();
 
-    UTEXT_PREVIOUS32(fText);
+    (void)UTEXT_PREVIOUS32(fText);
     int32_t lastResult    = handlePrevious(fData->fReverseTable);
     if (lastResult == UBRK_DONE) {
         lastResult = 0;
@@ -614,11 +707,11 @@ int32_t RuleBasedBreakIterator::previous(void) {
     // the result position that we are to return (in lastResult.)  If
     // the backwards rules overshot and the above loop had to do two or more
     // next()s to move up to the desired return position, we will have a valid
-    // tag value. But, if handlePrevious() took us to exactly the correct result positon,
+    // tag value. But, if handlePrevious() took us to exactly the correct result position,
     // we wont have a tag value for that position, which is only set by handleNext().
 
-    // set the current iteration position to be the last break position
-    // before where we started, and then return that value
+    // Set the current iteration position to be the last break position
+    // before where we started, and then return that value.
     utext_setNativeIndex(fText, lastResult);
     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
     fLastStatusIndexValid = breakTagValid;
@@ -636,6 +729,22 @@ int32_t RuleBasedBreakIterator::previous(void) {
  * @return The position of the first break after the current position.
  */
 int32_t RuleBasedBreakIterator::following(int32_t offset) {
+    // if the offset passed in is already past the end of the text,
+    // just return DONE; if it's before the beginning, return the
+    // text's starting offset
+    if (fText == NULL || offset >= utext_nativeLength(fText)) {
+        last();
+        return next();
+    }
+    else if (offset < 0) {
+        return first();
+    }
+
+    // Move requested offset to a code point start. It might be on a trail surrogate,
+    // or on a trail byte if the input is UTF-8.
+    utext_setNativeIndex(fText, offset);
+    offset = (int32_t)utext_getNativeIndex(fText);
+
     // if we have cached break positions and offset is in the range
     // covered by them, use them
     // TODO: could use binary search
@@ -657,20 +766,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
         }
     }
 
-    // if the offset passed in is already past the end of the text,
-    // just return DONE; if it's before the beginning, return the
-    // text's starting offset
-    fLastRuleStatusIndex  = 0;
-    fLastStatusIndexValid = TRUE;
-    if (fText == NULL || offset >= utext_nativeLength(fText)) {
-        last();
-        return next();
-    }
-    else if (offset < 0) {
-        return first();
-    }
-
-    // otherwise, set our internal iteration position (temporarily)
+    // Set our internal iteration position (temporarily)
     // to the position passed in.  If this is the _beginning_ position,
     // then we can just use next() to get our return value
 
@@ -682,7 +778,8 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
         // move forward one codepoint to prepare for moving back to a
         // safe point.
         // this handles offset being between a supplementary character
-        UTEXT_NEXT32(fText);
+        // TODO: is this still needed, with move to code point boundary handled above?
+        (void)UTEXT_NEXT32(fText);
         // handlePrevious will move most of the time to < 1 boundary away
         handlePrevious(fData->fSafeRevTable);
         int32_t result = next();
@@ -694,7 +791,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
     if (fData->fSafeFwdTable != NULL) {
         // backup plan if forward safe table is not available
         utext_setNativeIndex(fText, offset);
-        UTEXT_PREVIOUS32(fText);
+        (void)UTEXT_PREVIOUS32(fText);
         // handle next will give result >= offset
         handleNext(fData->fSafeFwdTable);
         // previous will give result 0 or 1 boundary away from offset,
@@ -725,7 +822,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
 
     utext_setNativeIndex(fText, offset);
     if (offset==0 || 
-        offset==1  && utext_getNativeIndex(fText)==0) {
+        (offset==1  && utext_getNativeIndex(fText)==0)) {
         return next();
     }
     result = previous();
@@ -744,6 +841,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
  * @return The position of the last boundary before the starting position.
  */
 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
+    // if the offset passed in is past the end of the text, return the
+    // result of last(); if it's before the beginning, return the result
+    // of first()
+    if (fText == NULL || offset > utext_nativeLength(fText)) {
+        return last();
+    }
+    else if (offset < 0) {
+        return first();
+    }
+
+    // Move requested offset to a code point start. It might be on a trail surrogate,
+    // or on a trail byte if the input is UTF-8.
+    utext_setNativeIndex(fText, offset);
+    offset = (int32_t)utext_getNativeIndex(fText);
+
     // if we have cached break positions and offset is in the range
     // covered by them, use them
     if (fCachedBreakPositions != NULL) {
@@ -769,17 +881,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
         }
     }
 
-    // if the offset passed in is already past the end of the text,
-    // just return DONE; if it's before the beginning, return the
-    // text's starting offset
-    if (fText == NULL || offset > utext_nativeLength(fText)) {
-        // return BreakIterator::DONE;
-        return last();
-    }
-    else if (offset < 0) {
-        return first();
-    }
-
     // if we start by updating the current iteration position to the
     // position specified by the caller, we can just use previous()
     // to carry out this operation
@@ -794,7 +895,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
             //   indices to the containing code point.
             // For breakitereator::preceding only, these non-code-point indices need to be moved
             //   up to refer to the following codepoint.
-            UTEXT_NEXT32(fText);
+            (void)UTEXT_NEXT32(fText);
             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
         }
 
@@ -803,7 +904,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
         //        (Change would interact with safe rules.)
         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
         //        affects only preceding(), seems cleaner, but is slightly different.
-        UTEXT_PREVIOUS32(fText);
+        (void)UTEXT_PREVIOUS32(fText);
         handleNext(fData->fSafeFwdTable);
         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
         while (result >= offset) {
@@ -818,7 +919,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
         //            if they use safe tables at all.  We have certainly never described
         //            to anyone how to work with just one safe table.
         utext_setNativeIndex(fText, offset);
-        UTEXT_NEXT32(fText);
+        (void)UTEXT_NEXT32(fText);
         
         // handle previous will give result <= offset
         handlePrevious(fData->fSafeRevTable);
@@ -910,6 +1011,54 @@ enum RBBIRunMode {
 };
 
 
+// Map from look-ahead break states (corresponds to rules) to boundary positions.
+// Allows multiple lookahead break rules to be in flight at the same time.
+//
+// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
+// in the state table be sequential, then we can just index an array. And the
+// table could also tell us in advance how big that array needs to be.
+//
+// Before ICU 57 there was just a single simple variable for a look-ahead match that
+// was in progress. Two rules at once did not work.
+
+static const int32_t kMaxLookaheads = 8;      // capacity of a LookAheadResults map
+struct LookAheadResults {
+    int32_t    fUsedSlotLimit;                // number of slots currently in use
+    int32_t    fPositions[kMaxLookaheads];    // recorded text position for each slot
+    int16_t    fKeys[kMaxLookaheads];         // look-ahead rule number for each slot
+    // Arrays are sized by kMaxLookaheads so the bound checked in setPosition() always matches.
+    LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
+    // Return the position recorded for key; asserts (debug) and returns -1 if none was recorded.
+    int32_t getPosition(int16_t key) const {
+        for (int32_t i=0; i<fUsedSlotLimit; ++i) {
+            if (fKeys[i] == key) {
+                return fPositions[i];
+            }
+        }
+        U_ASSERT(FALSE);
+        return -1;
+    }
+    // Record position for key, overwriting an existing entry; when full, the last slot is reused.
+    void setPosition(int16_t key, int32_t position) {
+        int32_t i;
+        for (i=0; i<fUsedSlotLimit; ++i) {
+            if (fKeys[i] == key) {
+                fPositions[i] = position;
+                return;
+            }
+        }
+        if (i >= kMaxLookaheads) {
+            U_ASSERT(FALSE);
+            i = kMaxLookaheads - 1;
+        }
+        fKeys[i] = key;
+        fPositions[i] = position;
+        U_ASSERT(fUsedSlotLimit == i);
+        fUsedSlotLimit = i + 1;
+    }
+};
+
+
 //-----------------------------------------------------------------------------------
 //
 //  handleNext(stateTable)
@@ -922,19 +1071,16 @@ enum RBBIRunMode {
 //-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
     int32_t             state;
-    int16_t             category        = 0;
+    uint16_t            category        = 0;
     RBBIRunMode         mode;
     
     RBBIStateTableRow  *row;
     UChar32             c;
-    int32_t             lookaheadStatus = 0;
-    int32_t             lookaheadTagIdx = 0;
-    int32_t             result          = 0;
-    int32_t             initialPosition = 0;
-    int32_t             lookaheadResult = 0;
-    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
-    const char         *tableData       = statetable->fTableData;
-    uint32_t            tableRowLen     = statetable->fRowLen;
+    LookAheadResults    lookAheadMatches;
+    int32_t             result             = 0;
+    int32_t             initialPosition    = 0;
+    const char         *tableData          = statetable->fTableData;
+    uint32_t            tableRowLen        = statetable->fRowLen;
 
     #ifdef RBBI_DEBUG
         if (fTrace) {
@@ -977,14 +1123,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
                 // We have already run the loop one last time with the 
                 //   character set to the psueudo {eof} value.  Now it is time
                 //   to unconditionally bail out.
-                if (lookaheadResult > result) {
-                    // We ran off the end of the string with a pending look-ahead match.
-                    // Treat this as if the look-ahead condition had been met, and return
-                    //  the match at the / position from the look-ahead rule.
-                    result               = lookaheadResult;
-                    fLastRuleStatusIndex = lookaheadTagIdx;
-                    lookaheadStatus = 0;
-                } 
                 break;
             }
             // Run the loop one last time with the fake end-of-input character category.
@@ -1017,9 +1155,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
             }
         }
 
-        #ifdef RBBI_DEBUG
+       #ifdef RBBI_DEBUG
             if (fTrace) {
-                RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
+                RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
                 if (0x20<=c && c<0x7f) {
                     RBBIDebugPrintf("\"%c\"  ", c);
                 } else {
@@ -1031,7 +1169,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
 
         // State Transition - move machine to its next state
         //
-        state = row->fNextState[category];
+
+        // Note: fNextState is defined as uint16_t[2], but we are casting
+        // a generated RBBI table to RBBIStateTableRow and some tables
+        // actually have more than 2 categories.
+        U_ASSERT(category<fData->fHeader->fCatCount);
+        state = row->fNextState[category];  /*Not accessing beyond memory*/
         row = (RBBIStateTableRow *)
             // (statetable->fTableData + (statetable->fRowLen * state));
             (tableData + tableRowLen * state);
@@ -1045,38 +1188,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
         }
 
-        if (row->fLookAhead != 0) {
-            if (lookaheadStatus != 0
-                && row->fAccepting == lookaheadStatus) {
-                // Lookahead match is completed.  
-                result               = lookaheadResult;
-                fLastRuleStatusIndex = lookaheadTagIdx;
-                lookaheadStatus      = 0;
-                // TODO:  make a standalone hard break in a rule work.
-                if (lookAheadHardBreak) {
-                    UTEXT_SETNATIVEINDEX(fText, result);
-                    return result;
-                }
-                // Look-ahead completed, but other rules may match further.  Continue on
-                //  TODO:  junk this feature?  I don't think it's used anywhwere.
-                goto continueOn;
+        int16_t completedRule = row->fAccepting;
+        if (completedRule > 0) {
+            // Lookahead match is completed.  
+            int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
+            if (lookaheadResult >= 0) {
+                fLastRuleStatusIndex = row->fTagIdx;
+                UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
+                return lookaheadResult;
             }
-
-            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
-            lookaheadResult = r;
-            lookaheadStatus = row->fLookAhead;
-            lookaheadTagIdx = row->fTagIdx;
-            goto continueOn;
         }
-
-
-        if (row->fAccepting != 0) {
-            // Because this is an accepting state, any in-progress look-ahead match
-            //   is no longer relavant.  Clear out the pending lookahead status.
-            lookaheadStatus = 0;           // clear out any pending look-ahead match.
+        int16_t rule = row->fLookAhead;
+        if (rule != 0) {
+            // At the position of a '/' in a look-ahead match. Record it.
+            int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+            lookAheadMatches.setPosition(rule, pos);
         }
 
-continueOn:
         if (state == STOP_STATE) {
             // This is the normal exit from the lookup state machine.
             // We have advanced through the string until it is certain that no
@@ -1134,15 +1262,13 @@ continueOn:
 //-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
     int32_t             state;
-    int16_t             category        = 0;
+    uint16_t            category        = 0;
     RBBIRunMode         mode;
     RBBIStateTableRow  *row;
     UChar32             c;
-    int32_t             lookaheadStatus = 0;
+    LookAheadResults    lookAheadMatches;
     int32_t             result          = 0;
     int32_t             initialPosition = 0;
-    int32_t             lookaheadResult = 0;
-    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
 
     #ifdef RBBI_DEBUG
         if (fTrace) {
@@ -1184,23 +1310,15 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
     for (;;) {
         if (c == U_SENTINEL) {
             // Reached end of input string.
-            if (mode == RBBI_END || 
-                *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
+            if (mode == RBBI_END) {
                 // We have already run the loop one last time with the 
                 //   character set to the psueudo {eof} value.  Now it is time
                 //   to unconditionally bail out.
-                //  (Or we have an old format binary rule file that does not support {eof}.)
-                if (lookaheadResult < result) {
-                    // We ran off the end of the string with a pending look-ahead match.
-                    // Treat this as if the look-ahead condition had been met, and return
-                    //  the match at the / position from the look-ahead rule.
-                    result               = lookaheadResult;
-                    lookaheadStatus = 0;
-                } else if (result == initialPosition) {
+                if (result == initialPosition) {
                     // Ran off start, no match found.
                     // move one index one (towards the start, since we are doing a previous())
                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
-                    UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
+                    (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
                 }
                 break;
             }
@@ -1248,7 +1366,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
 
         // State Transition - move machine to its next state
         //
-        state = row->fNextState[category];
+
+        // Note: fNextState is defined as uint16_t[2], but we are casting
+        // a generated RBBI table to RBBIStateTableRow and some tables
+        // actually have more than 2 categories.
+        U_ASSERT(category<fData->fHeader->fCatCount);
+        state = row->fNextState[category];  /*Not accessing beyond memory*/
         row = (RBBIStateTableRow *)
             (statetable->fTableData + (statetable->fRowLen * state));
 
@@ -1257,36 +1380,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
         }
 
-        if (row->fLookAhead != 0) {
-            if (lookaheadStatus != 0
-                && row->fAccepting == lookaheadStatus) {
-                // Lookahead match is completed.  
-                result               = lookaheadResult;
-                lookaheadStatus      = 0;
-                // TODO:  make a standalone hard break in a rule work.
-                if (lookAheadHardBreak) {
-                    UTEXT_SETNATIVEINDEX(fText, result);
-                    return result;
-                }
-                // Look-ahead completed, but other rules may match further.  Continue on
-                //  TODO:  junk this feature?  I don't think it's used anywhwere.
-                goto continueOn;
+        int16_t completedRule = row->fAccepting;
+        if (completedRule > 0) {
+            // Lookahead match is completed.  
+            int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
+            if (lookaheadResult >= 0) {
+                UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
+                return lookaheadResult;
             }
-
-            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
-            lookaheadResult = r;
-            lookaheadStatus = row->fLookAhead;
-            goto continueOn;
         }
-
-
-        if (row->fAccepting != 0) {
-            // Because this is an accepting state, any in-progress look-ahead match
-            //   is no longer relavant.  Clear out the pending lookahead status.
-            lookaheadStatus = 0;    
+        int16_t rule = row->fLookAhead;
+        if (rule != 0) {
+            // At the position of a '/' in a look-ahead match. Record it.
+            int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+            lookAheadMatches.setPosition(rule, pos);
         }
 
-continueOn:
         if (state == STOP_STATE) {
             // This is the normal exit from the lookup state machine.
             // We have advanced through the string until it is certain that no
@@ -1441,19 +1550,7 @@ const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
 }
 
 
-
-
-//-------------------------------------------------------------------------------
-//
-//  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
-//                    Saving one heap allocation isn't worth the trouble.
-//                    Cloning shouldn't be done in tight loops, and
-//                    making the clone copy involves other heap operations anyway.
-//                    And the application code for correctly dealing with buffer
-//                    size problems and the eventual object destruction is ugly.
-//
-//-------------------------------------------------------------------------------
-BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
+BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
                                    int32_t &bufferSize,
                                    UErrorCode &status)
 {
@@ -1461,62 +1558,18 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
         return NULL;
     }
 
-    //
-    //  If user buffer size is zero this is a preflight operation to
-    //    obtain the needed buffer size, allowing for worst case misalignment.
-    //
     if (bufferSize == 0) {
-        bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
+        bufferSize = 1;  // preflighting for deprecated functionality
         return NULL;
     }
 
-
-    //
-    //  Check the alignment and size of the user supplied buffer.
-    //  Allocate heap memory if the user supplied memory is insufficient.
-    //
-    char    *buf   = (char *)stackBuffer;
-    uint32_t s      = bufferSize;
-
-    if (stackBuffer == NULL) {
-        s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
-    }
-    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
-        uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
-        s   -= offsetUp;
-        buf += offsetUp;
-    }
-    if (s < sizeof(RuleBasedBreakIterator)) {
-        // Not enough room in the caller-supplied buffer.
-        // Do a plain-vanilla heap based clone and return that, along with
-        //   a warning that the clone was allocated.
-        RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
-        if (clonedBI == 0) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-        } else {
-            status = U_SAFECLONE_ALLOCATED_WARNING;
-        }
-        return clonedBI;
+    BreakIterator *clonedBI = clone();
+    if (clonedBI == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    } else {
+        status = U_SAFECLONE_ALLOCATED_WARNING;
     }
-
-    //
-    //  Clone the source BI into the caller-supplied buffer.
-    //    TODO:  using an overloaded operator new to directly initialize the
-    //           copy in the user's buffer would be better, but it doesn't seem
-    //           to get along with namespaces.  Investigate why.
-    //
-    //           The memcpy is only safe with an empty (default constructed)
-    //           break iterator.  Use on others can screw up reference counts
-    //           to data.  memcpy-ing objects is not really a good idea...
-    //
-    RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
-    RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
-    uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
-    clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
-    *clone = *this;               // clone = the real BI we want.
-    clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
-
-    return clone;
+    return (RuleBasedBreakIterator *)clonedBI;
 }
 
 
@@ -1552,10 +1605,12 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
                             int32_t endPos,
                             UBool reverse) {
     // Reset the old break cache first.
-    uint32_t dictionaryCount = fDictionaryCharCount;
     reset();
 
-    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
+    // note: code segment below assumes that dictionary chars are in the 
+    // startPos-endPos range
+    // value returned should be next character in sequence
+    if ((endPos - startPos) <= 1) {
         return (reverse ? startPos : endPos);
     }
     
@@ -1660,6 +1715,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
     // If we found breaks, build a new break cache. The first and last entries must
     // be the original starting and ending position.
     if (foundBreakCount > 0) {
+        U_ASSERT(foundBreakCount == breaks.size());
         int32_t totalBreaks = foundBreakCount;
         if (startPos < breaks.elementAti(0)) {
             totalBreaks += 1;
@@ -1684,7 +1740,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
             // proposed break by one of the breaks we found. Use following() and
             // preceding() to do the work. They should never recurse in this case.
             if (reverse) {
-                return preceding(endPos - 1);
+                return preceding(endPos);
             }
             else {
                 return following(startPos);
@@ -1701,9 +1757,9 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
 
 U_NAMESPACE_END
 
-// defined in ucln_cmn.h
 
-static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
+static icu::UStack *gLanguageBreakFactories = NULL;
+static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
 
 /**
  * Release all static memory held by breakiterator.  
@@ -1714,46 +1770,40 @@ static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
         delete gLanguageBreakFactories;
         gLanguageBreakFactories = NULL;
     }
+    gLanguageBreakFactoriesInitOnce.reset();
     return TRUE;
 }
 U_CDECL_END
 
 U_CDECL_BEGIN
 static void U_CALLCONV _deleteFactory(void *obj) {
-    delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
+    delete (icu::LanguageBreakFactory *) obj;
 }
 U_CDECL_END
 U_NAMESPACE_BEGIN
 
-static const LanguageBreakEngine*
-getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
-{
-    UBool       needsInit;
-    UErrorCode  status = U_ZERO_ERROR;
-    UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
-    
-    if (needsInit) {
-        UStack  *factories = new UStack(_deleteFactory, NULL, status);
-        if (factories != NULL && U_SUCCESS(status)) {
-            ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
-            factories->push(builtIn, status);
+static void U_CALLCONV initLanguageFactories() {
+    UErrorCode status = U_ZERO_ERROR;
+    U_ASSERT(gLanguageBreakFactories == NULL);
+    gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
+    if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
+        ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
+        gLanguageBreakFactories->push(builtIn, status);
 #ifdef U_LOCAL_SERVICE_HOOK
-            LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
-            if (extra != NULL) {
-                factories->push(extra, status);
-            }
-#endif
-        }
-        umtx_lock(NULL);
-        if (gLanguageBreakFactories == NULL) {
-            gLanguageBreakFactories = factories;
-            factories = NULL;
-            ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
+        LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
+        if (extra != NULL) {
+            gLanguageBreakFactories->push(extra, status);
         }
-        umtx_unlock(NULL);
-        delete factories;
+#endif
     }
-    
+    ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
+}
+
+
+static const LanguageBreakEngine*
+getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
+{
+    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
     if (gLanguageBreakFactories == NULL) {
         return NULL;
     }
@@ -1774,7 +1824,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
 //-------------------------------------------------------------------------------
 //
 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
-//                          the characer c.
+//                          character c.
 //
 //-------------------------------------------------------------------------------
 const LanguageBreakEngine *