/*
***************************************************************************
-* Copyright (C) 1999-2006 International Business Machines Corporation *
-* and others. All rights reserved. *
+* Copyright (C) 1999-2016 International Business Machines Corporation
+* and others. All rights reserved.
***************************************************************************
*/
//
// class RuleBasedBreakIterator
//
+#include "utypeinfo.h" // for 'typeid' to work
+
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "rbbirb.h"
#include "cmemory.h"
#include "cstring.h"
-#include "mutex.h"
+#include "umutex.h"
#include "ucln_cmn.h"
#include "brkeng.h"
U_NAMESPACE_BEGIN
+// The state number of the starting state
+#define START_STATE 1
-static const int16_t START_STATE = 1; // The state number of the starting state
-static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
+// The state-transition value indicating "stop"
+#define STOP_STATE 0
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
}
}
+/**
+ * Construct a break iterator over precompiled rule data without adopting
+ * the memory: the data is wrapped using RBBIDataWrapper::kDontAdopt.
+ *
+ * @param data    header of the precompiled break rule data
+ * @param status  in/out error code; set to U_MEMORY_ALLOCATION_ERROR when the
+ *                data wrapper could not be allocated and no other failure
+ *                has already been recorded
+ */
+RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
+{
+    init();
+    // The wrapper's constructor checks (and may set) status itself.
+    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
+    if (U_SUCCESS(status) && fData == NULL) {
+        // A failure already in status takes precedence over the allocation error.
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+
+//
+//  Public API constructor from precompiled binary rules (state tables).
+//  The rules are passed as a (const uint8_t *) so that the parameter type matches
+//  what getBinaryRules() produces. The rule data is wrapped with kDontAdopt.
+//
+//  status is set to U_ILLEGAL_ARGUMENT_ERROR when compiledRules is NULL, when
+//  ruleLength is too small to hold an RBBIDataHeader, or when the length recorded
+//  in the header exceeds ruleLength; and to U_MEMORY_ALLOCATION_ERROR when the
+//  data wrapper cannot be allocated.
+//
+RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
+                       uint32_t       ruleLength,
+                       UErrorCode     &status) {
+    init();
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // Short-circuit evaluation guarantees that the header is only read once the
+    // buffer is known to be non-NULL and at least as large as a header.
+    const RBBIDataHeader *header = (const RBBIDataHeader *)compiledRules;
+    if (header == NULL
+            || ruleLength < sizeof(RBBIDataHeader)
+            || header->fLength > ruleLength) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    fData = new RBBIDataWrapper(header, RBBIDataWrapper::kDontAdopt, status);
+    if (U_SUCCESS(status) && fData == NULL) {
+        // A failure reported by the wrapper takes precedence over the allocation error.
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+
//-------------------------------------------------------------------------------
//
// Constructor from a UDataMemory handle to precompiled break rules
init();
if (U_FAILURE(status)) {return;}
RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
- RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
+ RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
// Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
// creates and returns a complete RBBI. From here, in a constructor, we
// can't just return the object created by the builder factory, hence
if (this == &that) {
return *this;
}
+ fKeepAll = that.fKeepAll;
reset(); // Delete break cache information
fBreakType = that.fBreakType;
if (fLanguageBreakEngines != NULL) {
//-----------------------------------------------------------------------------
void RuleBasedBreakIterator::init() {
UErrorCode status = U_ZERO_ERROR;
- fBufferClone = FALSE;
fText = utext_openUChars(NULL, NULL, 0, &status);
fCharIter = NULL;
fSCharIter = NULL;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
fDictionaryCharCount = 0;
- fBreakType = -1;
+ fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
+ // dictionary behavior for Break Iterators that are
+ // built from rules. Even better would be the ability to
+ // declare the type in the rules.
fCachedBreakPositions = NULL;
fLanguageBreakEngines = NULL;
*/
UBool
RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
- if (that.getDynamicClassID() != getDynamicClassID()) {
+ if (typeid(*this) != typeid(that)) {
return FALSE;
}
const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
+ if (that2.fKeepAll != fKeepAll) {
+ return FALSE;
+ }
if (!utext_equals(fText, that2.fText)) {
// The two break iterators are operating on different text,
// we can come to signaling a failure.
// (GetText() is obsolete, this failure is sort of OK)
if (fDCharIter == NULL) {
- static UChar c = 0;
+ static const UChar c = 0;
fDCharIter = new UCharCharacterIterator(&c, 0);
+ if (fDCharIter == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
}
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
}
+/**
+ * Replace the UText this iterator reads from with a new one whose contents
+ * must be identical to the original text.
+ *
+ * Intended for text that originates in Java (garbage collected) environments,
+ * where the underlying data may be moved in memory at arbitrary times.
+ *
+ * @param input   the replacement text; NULL yields U_ILLEGAL_ARGUMENT_ERROR
+ * @param status  in/out error code; also set to U_ILLEGAL_ARGUMENT_ERROR when
+ *                the previous iteration position cannot be restored in the new text
+ * @return        this break iterator
+ */
+RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
+    if (U_SUCCESS(status) && input == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+
+    // Remember where we were before the text is swapped out.
+    const int64_t savedIndex = utext_getNativeIndex(fText);
+
+    // Shallow, read-only clone of the new UText into the UText we already hold.
+    fText = utext_clone(fText, input, FALSE, TRUE, &status);
+    if (U_SUCCESS(status)) {
+        utext_setNativeIndex(fText, savedIndex);
+        if (utext_getNativeIndex(fText) != savedIndex) {
+            // Sanity check. The new text is supposed to match the old text exactly;
+            // failing to land on the same position means it does not. The storage
+            // behind the old text may already be invalid, so the contents cannot
+            // safely be compared directly.
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+        }
+    }
+    return *this;
+}
+
/**
- * Sets the current iteration position to the beginning of the text.
- * @return The offset of the beginning of the text.
+ * Sets the current iteration position to the beginning of the text, position zero.
+ * @return The new iterator position, which is zero.
*/
int32_t RuleBasedBreakIterator::first(void) {
reset();
}
int32_t startPos = current();
+ fDictionaryCharCount = 0;
int32_t result = handleNext(fData->fForwardTable);
+ while (fKeepAll) {
+ UChar32 prevChr = utext_char32At(fText, result-1);
+ UChar32 currChr = utext_char32At(fText, result);
+ if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
+ break;
+ }
+ int32_t nextResult = handleNext(fData->fForwardTable);
+ if (nextResult <= result) {
+ break;
+ }
+ result = nextResult;
+ }
if (fDictionaryCharCount > 0) {
result = checkDictionary(startPos, result, FALSE);
}
if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
result = handlePrevious(fData->fReverseTable);
+ while (fKeepAll) {
+ UChar32 prevChr = utext_char32At(fText, result-1);
+ UChar32 currChr = utext_char32At(fText, result);
+ if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
+ break;
+ }
+ int32_t prevResult = handlePrevious(fData->fReverseTable);
+ if (prevResult >= result) {
+ break;
+ }
+ result = prevResult;
+ }
if (fDictionaryCharCount > 0) {
result = checkDictionary(result, startPos, TRUE);
}
// break position before the current position (we back our internal
// iterator up one step to prevent handlePrevious() from returning
// the current position), but not necessarily the last one before
-
// where we started
int32_t start = current();
- UTEXT_PREVIOUS32(fText);
+ (void)UTEXT_PREVIOUS32(fText);
int32_t lastResult = handlePrevious(fData->fReverseTable);
if (lastResult == UBRK_DONE) {
lastResult = 0;
// the result position that we are to return (in lastResult.) If
// the backwards rules overshot and the above loop had to do two or more
// next()s to move up to the desired return position, we will have a valid
- // tag value. But, if handlePrevious() took us to exactly the correct result positon,
+ // tag value. But, if handlePrevious() took us to exactly the correct result position,
// we wont have a tag value for that position, which is only set by handleNext().
- // set the current iteration position to be the last break position
- // before where we started, and then return that value
+ // Set the current iteration position to be the last break position
+ // before where we started, and then return that value.
utext_setNativeIndex(fText, lastResult);
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
fLastStatusIndexValid = breakTagValid;
* @return The position of the first break after the current position.
*/
int32_t RuleBasedBreakIterator::following(int32_t offset) {
+ // if the offset passed in is already past the end of the text,
+ // just return DONE; if it's before the beginning, return the
+ // text's starting offset
+ if (fText == NULL || offset >= utext_nativeLength(fText)) {
+ last();
+ return next();
+ }
+ else if (offset < 0) {
+ return first();
+ }
+
+ // Move requested offset to a code point start. It might be on a trail surrogate,
+ // or on a trail byte if the input is UTF-8.
+ utext_setNativeIndex(fText, offset);
+ offset = (int32_t)utext_getNativeIndex(fText);
+
// if we have cached break positions and offset is in the range
// covered by them, use them
// TODO: could use binary search
}
}
- // if the offset passed in is already past the end of the text,
- // just return DONE; if it's before the beginning, return the
- // text's starting offset
- fLastRuleStatusIndex = 0;
- fLastStatusIndexValid = TRUE;
- if (fText == NULL || offset >= utext_nativeLength(fText)) {
- last();
- return next();
- }
- else if (offset < 0) {
- return first();
- }
-
- // otherwise, set our internal iteration position (temporarily)
+ // Set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
// move forward one codepoint to prepare for moving back to a
// safe point.
// this handles offset being between a supplementary character
- UTEXT_NEXT32(fText);
+ // TODO: is this still needed, with move to code point boundary handled above?
+ (void)UTEXT_NEXT32(fText);
// handlePrevious will move most of the time to < 1 boundary away
handlePrevious(fData->fSafeRevTable);
int32_t result = next();
if (fData->fSafeFwdTable != NULL) {
// backup plan if forward safe table is not available
utext_setNativeIndex(fText, offset);
- UTEXT_PREVIOUS32(fText);
+ (void)UTEXT_PREVIOUS32(fText);
// handle next will give result >= offset
handleNext(fData->fSafeFwdTable);
// previous will give result 0 or 1 boundary away from offset,
utext_setNativeIndex(fText, offset);
if (offset==0 ||
- offset==1 && utext_getNativeIndex(fText)==0) {
+ (offset==1 && utext_getNativeIndex(fText)==0)) {
return next();
}
result = previous();
* @return The position of the last boundary before the starting position.
*/
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
+ // If the offset passed in is past the end of the text, return last();
+ // if it's before the beginning, return first(), the text's starting
+ // offset.
+ if (fText == NULL || offset > utext_nativeLength(fText)) {
+ return last();
+ }
+ else if (offset < 0) {
+ return first();
+ }
+
+ // Move requested offset to a code point start. It might be on a trail surrogate,
+ // or on a trail byte if the input is UTF-8.
+ utext_setNativeIndex(fText, offset);
+ offset = (int32_t)utext_getNativeIndex(fText);
+
// if we have cached break positions and offset is in the range
// covered by them, use them
if (fCachedBreakPositions != NULL) {
}
}
- // if the offset passed in is already past the end of the text,
- // just return DONE; if it's before the beginning, return the
- // text's starting offset
- if (fText == NULL || offset > utext_nativeLength(fText)) {
- // return BreakIterator::DONE;
- return last();
- }
- else if (offset < 0) {
- return first();
- }
-
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
// indices to the containing code point.
// For breakitereator::preceding only, these non-code-point indices need to be moved
// up to refer to the following codepoint.
- UTEXT_NEXT32(fText);
+ (void)UTEXT_NEXT32(fText);
offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
}
// (Change would interact with safe rules.)
// TODO: change RBBI behavior for off-boundary indices to match that of UText?
// affects only preceding(), seems cleaner, but is slightly different.
- UTEXT_PREVIOUS32(fText);
+ (void)UTEXT_PREVIOUS32(fText);
handleNext(fData->fSafeFwdTable);
int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
while (result >= offset) {
// if they use safe tables at all. We have certainly never described
// to anyone how to work with just one safe table.
utext_setNativeIndex(fText, offset);
- UTEXT_NEXT32(fText);
+ (void)UTEXT_NEXT32(fText);
// handle previous will give result <= offset
handlePrevious(fData->fSafeRevTable);
};
+// Map from look-ahead break states (corresponds to rules) to boundary positions.
+// Allows multiple lookahead break rules to be in flight at the same time.
+//
+// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
+// in the state table be sequential, then we can just index an array. And the
+// table could also tell us in advance how big that array needs to be.
+//
+// Before ICU 57 there was just a single simple variable for a look-ahead match that
+// was in progress. Two rules at once did not work.
+
+// Capacity of a LookAheadResults map. The slot arrays below are sized from this
+// constant so that the bounds check in setPosition() always agrees with the storage.
+static const int32_t kMaxLookaheads = 8;
+struct LookAheadResults {
+    int32_t    fUsedSlotLimit;               // Slots [0, fUsedSlotLimit) hold valid entries.
+    int32_t    fPositions[kMaxLookaheads];   // Recorded position for each slot.
+    int16_t    fKeys[kMaxLookaheads];        // Key (look-ahead rule number) for each slot.
+
+    LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
+
+    // Return the position recorded for key.
+    // An unknown key trips U_ASSERT and yields -1.
+    int32_t getPosition(int16_t key) {
+        for (int32_t i=0; i<fUsedSlotLimit; ++i) {
+            if (fKeys[i] == key) {
+                return fPositions[i];
+            }
+        }
+        U_ASSERT(FALSE);
+        return -1;
+    }
+
+    // Record position for key, overwriting the existing entry for that key if
+    // there is one, otherwise taking the next free slot.
+    void setPosition(int16_t key, int32_t position) {
+        int32_t i;
+        for (i=0; i<fUsedSlotLimit; ++i) {
+            if (fKeys[i] == key) {
+                fPositions[i] = position;
+                return;
+            }
+        }
+        if (i >= kMaxLookaheads) {
+            // Map is full: trip U_ASSERT, then reuse the last slot rather than
+            // writing past the end of the arrays.
+            U_ASSERT(FALSE);
+            i = kMaxLookaheads - 1;
+        }
+        fKeys[i] = key;
+        fPositions[i] = position;
+        U_ASSERT(fUsedSlotLimit == i);
+        fUsedSlotLimit = i + 1;
+    }
+};
+
+
//-----------------------------------------------------------------------------------
//
// handleNext(stateTable)
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
int32_t state;
- int16_t category = 0;
+ uint16_t category = 0;
RBBIRunMode mode;
RBBIStateTableRow *row;
UChar32 c;
- int32_t lookaheadStatus = 0;
- int32_t lookaheadTagIdx = 0;
- int32_t result = 0;
- int32_t initialPosition = 0;
- int32_t lookaheadResult = 0;
- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
- const char *tableData = statetable->fTableData;
- uint32_t tableRowLen = statetable->fRowLen;
+ LookAheadResults lookAheadMatches;
+ int32_t result = 0;
+ int32_t initialPosition = 0;
+ const char *tableData = statetable->fTableData;
+ uint32_t tableRowLen = statetable->fRowLen;
#ifdef RBBI_DEBUG
if (fTrace) {
// We have already run the loop one last time with the
// character set to the psueudo {eof} value. Now it is time
// to unconditionally bail out.
- if (lookaheadResult > result) {
- // We ran off the end of the string with a pending look-ahead match.
- // Treat this as if the look-ahead condition had been met, and return
- // the match at the / position from the look-ahead rule.
- result = lookaheadResult;
- fLastRuleStatusIndex = lookaheadTagIdx;
- lookaheadStatus = 0;
- }
break;
}
// Run the loop one last time with the fake end-of-input character category.
}
}
- #ifdef RBBI_DEBUG
+ #ifdef RBBI_DEBUG
if (fTrace) {
- RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
+ RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText));
if (0x20<=c && c<0x7f) {
RBBIDebugPrintf("\"%c\" ", c);
} else {
// State Transition - move machine to its next state
//
- state = row->fNextState[category];
+
+ // Note: fNextState is defined as uint16_t[2], but we are casting
+ // a generated RBBI table to RBBIStateTableRow and some tables
+ // actually have more than 2 categories.
+ U_ASSERT(category<fData->fHeader->fCatCount);
+ state = row->fNextState[category]; /*Not accessing beyond memory*/
row = (RBBIStateTableRow *)
// (statetable->fTableData + (statetable->fRowLen * state));
(tableData + tableRowLen * state);
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
}
- if (row->fLookAhead != 0) {
- if (lookaheadStatus != 0
- && row->fAccepting == lookaheadStatus) {
- // Lookahead match is completed.
- result = lookaheadResult;
- fLastRuleStatusIndex = lookaheadTagIdx;
- lookaheadStatus = 0;
- // TODO: make a standalone hard break in a rule work.
- if (lookAheadHardBreak) {
- utext_setNativeIndex(fText, result);
- return result;
- }
- // Look-ahead completed, but other rules may match further. Continue on
- // TODO: junk this feature? I don't think it's used anywhwere.
- goto continueOn;
+ int16_t completedRule = row->fAccepting;
+ if (completedRule > 0) {
+ // Lookahead match is completed.
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
+ if (lookaheadResult >= 0) {
+ fLastRuleStatusIndex = row->fTagIdx;
+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
+ return lookaheadResult;
}
-
- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
- lookaheadResult = r;
- lookaheadStatus = row->fLookAhead;
- lookaheadTagIdx = row->fTagIdx;
- goto continueOn;
}
-
-
- if (row->fAccepting != 0) {
- // Because this is an accepting state, any in-progress look-ahead match
- // is no longer relavant. Clear out the pending lookahead status.
- lookaheadStatus = 0; // clear out any pending look-ahead match.
+ int16_t rule = row->fLookAhead;
+ if (rule != 0) {
+ // At the position of a '/' in a look-ahead match. Record it.
+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ lookAheadMatches.setPosition(rule, pos);
}
-continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
if (result == initialPosition) {
- utext_setNativeIndex(fText, initialPosition);
+ UTEXT_SETNATIVEINDEX(fText, initialPosition);
UTEXT_NEXT32(fText);
result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
}
// Leave the iterator at our result position.
- utext_setNativeIndex(fText, result);
+ UTEXT_SETNATIVEINDEX(fText, result);
#ifdef RBBI_DEBUG
if (fTrace) {
RBBIDebugPrintf("result = %d\n\n", result);
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
int32_t state;
- int16_t category = 0;
+ uint16_t category = 0;
RBBIRunMode mode;
RBBIStateTableRow *row;
UChar32 c;
- int32_t lookaheadStatus = 0;
+ LookAheadResults lookAheadMatches;
int32_t result = 0;
int32_t initialPosition = 0;
- int32_t lookaheadResult = 0;
- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
#ifdef RBBI_DEBUG
if (fTrace) {
for (;;) {
if (c == U_SENTINEL) {
// Reached end of input string.
- if (mode == RBBI_END ||
- *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
+ if (mode == RBBI_END) {
// We have already run the loop one last time with the
// character set to the psueudo {eof} value. Now it is time
// to unconditionally bail out.
- // (Or we have an old format binary rule file that does not support {eof}.)
- if (lookaheadResult < result) {
- // We ran off the end of the string with a pending look-ahead match.
- // Treat this as if the look-ahead condition had been met, and return
- // the match at the / position from the look-ahead rule.
- result = lookaheadResult;
- lookaheadStatus = 0;
- } else if (result == initialPosition) {
+ if (result == initialPosition) {
// Ran off start, no match found.
// move one index one (towards the start, since we are doing a previous())
- utext_setNativeIndex(fText, initialPosition);
- UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
+ UTEXT_SETNATIVEINDEX(fText, initialPosition);
+ (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
}
break;
}
// State Transition - move machine to its next state
//
- state = row->fNextState[category];
+
+ // Note: fNextState is defined as uint16_t[2], but we are casting
+ // a generated RBBI table to RBBIStateTableRow and some tables
+ // actually have more than 2 categories.
+ U_ASSERT(category<fData->fHeader->fCatCount);
+ state = row->fNextState[category]; /*Not accessing beyond memory*/
row = (RBBIStateTableRow *)
(statetable->fTableData + (statetable->fRowLen * state));
result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
}
- if (row->fLookAhead != 0) {
- if (lookaheadStatus != 0
- && row->fAccepting == lookaheadStatus) {
- // Lookahead match is completed.
- result = lookaheadResult;
- lookaheadStatus = 0;
- // TODO: make a standalone hard break in a rule work.
- if (lookAheadHardBreak) {
- utext_setNativeIndex(fText, result);
- return result;
- }
- // Look-ahead completed, but other rules may match further. Continue on
- // TODO: junk this feature? I don't think it's used anywhwere.
- goto continueOn;
+ int16_t completedRule = row->fAccepting;
+ if (completedRule > 0) {
+ // Lookahead match is completed.
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
+ if (lookaheadResult >= 0) {
+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
+ return lookaheadResult;
}
-
- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
- lookaheadResult = r;
- lookaheadStatus = row->fLookAhead;
- goto continueOn;
}
-
-
- if (row->fAccepting != 0) {
- // Because this is an accepting state, any in-progress look-ahead match
- // is no longer relavant. Clear out the pending lookahead status.
- lookaheadStatus = 0;
+ int16_t rule = row->fLookAhead;
+ if (rule != 0) {
+ // At the position of a '/' in a look-ahead match. Record it.
+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ lookAheadMatches.setPosition(rule, pos);
}
-continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
if (result == initialPosition) {
- utext_setNativeIndex(fText, initialPosition);
+ UTEXT_SETNATIVEINDEX(fText, initialPosition);
UTEXT_PREVIOUS32(fText);
result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
}
// Leave the iterator at our result position.
- utext_setNativeIndex(fText, result);
+ UTEXT_SETNATIVEINDEX(fText, result);
#ifdef RBBI_DEBUG
if (fTrace) {
RBBIDebugPrintf("result = %d\n\n", result);
}
-
-
-//-------------------------------------------------------------------------------
-//
-// BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
-// Saving one heap allocation isn't worth the trouble.
-// Cloning shouldn't be done in tight loops, and
-// making the clone copy involves other heap operations anyway.
-// And the application code for correctly dealing with buffer
-// size problems and the eventual object destruction is ugly.
-//
-//-------------------------------------------------------------------------------
-BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
+BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
int32_t &bufferSize,
UErrorCode &status)
{
return NULL;
}
- //
- // If user buffer size is zero this is a preflight operation to
- // obtain the needed buffer size, allowing for worst case misalignment.
- //
if (bufferSize == 0) {
- bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
+ bufferSize = 1; // preflighting for deprecated functionality
return NULL;
}
-
- //
- // Check the alignment and size of the user supplied buffer.
- // Allocate heap memory if the user supplied memory is insufficient.
- //
- char *buf = (char *)stackBuffer;
- uint32_t s = bufferSize;
-
- if (stackBuffer == NULL) {
- s = 0; // Ignore size, force allocation if user didn't give us a buffer.
- }
- if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
- uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
- s -= offsetUp;
- buf += offsetUp;
- }
- if (s < sizeof(RuleBasedBreakIterator)) {
- // Not enough room in the caller-supplied buffer.
- // Do a plain-vanilla heap based clone and return that, along with
- // a warning that the clone was allocated.
- RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
- if (clonedBI == 0) {
- status = U_MEMORY_ALLOCATION_ERROR;
- } else {
- status = U_SAFECLONE_ALLOCATED_WARNING;
- }
- return clonedBI;
+ BreakIterator *clonedBI = clone();
+ if (clonedBI == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ } else {
+ status = U_SAFECLONE_ALLOCATED_WARNING;
}
-
- //
- // Clone the source BI into the caller-supplied buffer.
- // TODO: using an overloaded operator new to directly initialize the
- // copy in the user's buffer would be better, but it doesn't seem
- // to get along with namespaces. Investigate why.
- //
- // The memcpy is only safe with an empty (default constructed)
- // break iterator. Use on others can screw up reference counts
- // to data. memcpy-ing objects is not really a good idea...
- //
- RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy
- RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
- uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
- clone->init(); // Init RuleBasedBreakIterator part, (user default constructor)
- *clone = *this; // clone = the real BI we want.
- clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code)
-
- return clone;
+ return (RuleBasedBreakIterator *)clonedBI;
}
int32_t endPos,
UBool reverse) {
// Reset the old break cache first.
- uint32_t dictionaryCount = fDictionaryCharCount;
reset();
- if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
+ // note: code segment below assumes that dictionary chars are in the
+ // startPos-endPos range
+ // value returned should be next character in sequence
+ if ((endPos - startPos) <= 1) {
return (reverse ? startPos : endPos);
}
// If we found breaks, build a new break cache. The first and last entries must
// be the original starting and ending position.
if (foundBreakCount > 0) {
+ U_ASSERT(foundBreakCount == breaks.size());
int32_t totalBreaks = foundBreakCount;
if (startPos < breaks.elementAti(0)) {
totalBreaks += 1;
// proposed break by one of the breaks we found. Use following() and
// preceding() to do the work. They should never recurse in this case.
if (reverse) {
- return preceding(endPos - 1);
+ return preceding(endPos);
}
else {
return following(startPos);
return (reverse ? startPos : endPos);
}
-static UStack *gLanguageBreakFactories = NULL;
-
U_NAMESPACE_END
-// defined in ucln_cmn.h
+
+static icu::UStack *gLanguageBreakFactories = NULL;
+static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
/**
* Release all static memory held by breakiterator.
delete gLanguageBreakFactories;
gLanguageBreakFactories = NULL;
}
+ gLanguageBreakFactoriesInitOnce.reset();
return TRUE;
}
U_CDECL_END
U_CDECL_BEGIN
static void U_CALLCONV _deleteFactory(void *obj) {
- delete (LanguageBreakFactory *) obj;
+ delete (icu::LanguageBreakFactory *) obj;
}
U_CDECL_END
U_NAMESPACE_BEGIN
-static const LanguageBreakEngine*
-getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
-{
- UBool needsInit;
- UErrorCode status = U_ZERO_ERROR;
- umtx_lock(NULL);
- needsInit = (UBool)(gLanguageBreakFactories == NULL);
- umtx_unlock(NULL);
-
- if (needsInit) {
- UStack *factories = new UStack(_deleteFactory, NULL, status);
- if (U_SUCCESS(status)) {
- ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
- factories->push(builtIn, status);
+// Initializer for gLanguageBreakFactories; it is handed to umtx_initOnce() by
+// getLanguageBreakEngineFromFactory().
+// Allocates the factory stack and, when that succeeds, pushes a new
+// ICULanguageBreakFactory onto it, followed by the factory returned from
+// uprv_svc_hook() (if any) when U_LOCAL_SERVICE_HOOK is defined.
+// The cleanup function breakiterator_cleanup_dict is registered regardless of
+// whether the stack was created successfully.
+static void U_CALLCONV initLanguageFactories() {
+ UErrorCode status = U_ZERO_ERROR;
+ U_ASSERT(gLanguageBreakFactories == NULL);
+ gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
+ if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
+ ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
+ gLanguageBreakFactories->push(builtIn, status);
#ifdef U_LOCAL_SERVICE_HOOK
- LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
- if (extra != NULL) {
- factories->push(extra, status);
- }
-#endif
- }
- umtx_lock(NULL);
- if (gLanguageBreakFactories == NULL) {
- gLanguageBreakFactories = factories;
- factories = NULL;
- ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
+ LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
+ if (extra != NULL) {
+ gLanguageBreakFactories->push(extra, status);
}
- umtx_unlock(NULL);
- delete factories;
+#endif
}
-
+ ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
+}
+
+
+static const LanguageBreakEngine*
+getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
+{
+ umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
if (gLanguageBreakFactories == NULL) {
return NULL;
}
//-------------------------------------------------------------------------------
//
// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
-// the characer c.
+// the character c.
//
//-------------------------------------------------------------------------------
const LanguageBreakEngine *
if (fLanguageBreakEngines == NULL) {
fLanguageBreakEngines = new UStack(status);
- if (U_FAILURE(status)) {
+ if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
delete fLanguageBreakEngines;
fLanguageBreakEngines = 0;
return NULL;