--- /dev/null
+/*
+***************************************************************************
+* Copyright (C) 2006 Apple Computer, Inc. All rights reserved. *
+***************************************************************************
+
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "rbtok.h"
+#include "unicode/ustring.h"
+#include "unicode/utext.h"
+#include "rbbidata.h"
+
+U_NAMESPACE_BEGIN
+
+
+// Compiler hint for the hot tokenizer loop below. Only emitted for GCC 4 and
+// later; the matching "reset" pragma follows RuleBasedTokenizer::tokenize().
+// NOTE(review): "optimization_level" looks like an Apple-GCC pragma; other
+// compilers are expected to ignore it - confirm.
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#pragma GCC optimization_level 3
+#endif
+
+// State numbers used when walking the forward state table.
+static const int16_t STOP_STATE = 0;  // A transition to this state means "stop".
+static const int16_t START_STATE = 1; // The state every token match begins in.
+
+/**
+ * Tokenizes the text attached to this iterator (fText), starting at the
+ * text's current native index, and stores up to maxTokens token ranges in
+ * the caller's buffers.
+ *
+ * Each token end is found either by running the forward state table over
+ * the input, or - while cached break positions produced by checkDictionary()
+ * are still available - by stepping through that cache. The flags of a token
+ * are read from fStateFlags, indexed by the last accepting state reached by
+ * the state machine:
+ *   - flags == -1        : the token is skipped, nothing is stored for it;
+ *   - flags & 0x40000000 : the token is stored and tokenizing stops after it.
+ *
+ * @param maxTokens       Capacity, in entries, of outTokenRanges (and of
+ *                        outTokenFlags when that is non-NULL).
+ * @param outTokenRanges  Receives one entry per stored token, initialized as
+ *                        {start native index, length}.
+ * @param outTokenFlags   Optional, may be NULL. Receives the flags of each
+ *                        stored token, parallel to outTokenRanges.
+ * @return                The number of entries written to outTokenRanges;
+ *                        0 if maxTokens <= 0 or outTokenRanges is NULL.
+ */
+int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
+{
+    RuleBasedTokenRange *outTokenP = outTokenRanges;
+    int32_t state;
+    int16_t category = 0;
+
+    const RBBIStateTableRow *row;
+    const RBBIStateTableRow *const startRow = fStartRow;
+
+    int32_t lastAcceptingState = 0;
+    UChar32 c = 0;
+    signed long prev;
+    signed long result;
+    const char *const tableData = fData->fForwardTable->fTableData;
+    const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
+    UText *text = fText;
+
+    #ifdef RBBI_DEBUG
+    if (fTrace) {
+        RBBIDebugPuts("Handle Next pos char state category");
+    }
+    #endif
+
+    fLastStatusIndexValid = FALSE;
+
+    // Nothing can be stored: avoid forming pointers from a NULL or empty buffer.
+    if (maxTokens <= 0 || outTokenRanges == NULL) {
+        return 0;
+    }
+    RuleBasedTokenRange *const outTokenLimit = outTokenRanges + maxTokens;
+
+    // The first token starts wherever the text is currently positioned.
+    prev = (signed long)UTEXT_GETNATIVEINDEX(text);
+    result = prev;
+
+    // One iteration per token: loop until the output buffer is full, the end
+    // of the text is reached, or a token carrying the "stop" flag is stored.
+    //
+    const UTrie *trie = &fData->fTrie;
+    while (outTokenP < outTokenLimit) {
+        c = UTEXT_NEXT32(text);
+        if (c == U_SENTINEL)
+        {
+            // End of text: no further tokens.
+            goto exitTokenizer;
+        }
+        // Set the initial state for the state machine
+        state = START_STATE;
+        row = startRow;
+
+        // if we have cached break positions and we're still in the range
+        // covered by them, just move one step forward in the cache.
+        // lastAcceptingState is deliberately left untouched on this path, so
+        // cached tokens get the flags of the match that filled the cache.
+        if (fCachedBreakPositions != NULL) {
+            if (fPositionInCache < fNumCachedBreakPositions - 1) {
+                ++fPositionInCache;
+                result = fCachedBreakPositions[fPositionInCache];
+                goto emitToken;
+            }
+            else {
+                reset();
+            }
+        }
+
+        // Fresh state-machine run: forget the previous token's match so that
+        // a run that never reaches an accepting state can be detected below
+        // instead of reusing stale (or uninitialized) values.
+        result = prev;
+        lastAcceptingState = 0;
+
+        while (c != U_SENTINEL) {
+            //
+            // look up the current character's character category, which tells us
+            // which column in the state table to look at. Latin-1 characters
+            // use the table precomputed by init(); everything else goes
+            // through the trie.
+            // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
+            // not the size of the character going in, which is a UChar32.
+            //
+            if (__builtin_expect((c < 0x100), 1)) {
+                category = fLatin1Cat[c];
+            } else {
+                UTRIE_GET16(trie, c, category);
+            }
+
+            // Check the dictionary bit in the character's category.
+            // Chars that need to be handled by a dictionary have a flag bit set
+            // in their category values; count them for the check below.
+            //
+            if (__builtin_expect((category & 0x4000) != 0, 0)) {
+                fDictionaryCharCount++;
+                // And off the dictionary flag bit.
+                category &= ~0x4000;
+            }
+
+            #ifdef RBBI_DEBUG
+            if (fTrace) {
+                // utext_getNativeIndex() returns a 64-bit value; narrow it to
+                // match the %d conversion.
+                RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
+                if (0x20<=c && c<0x7f) {
+                    RBBIDebugPrintf("\"%c\" ", c);
+                } else {
+                    RBBIDebugPrintf("%5x ", c);
+                }
+                RBBIDebugPrintf("%3d %3d\n", state, category);
+            }
+            #endif
+
+            // State Transition - move machine to its next state
+            //
+            state = row->fNextState[category];
+            row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
+
+            if (row->fAccepting == -1) {
+                // Match found, common case. Remember where it ends and which
+                // state accepted it (used for the token's flags).
+                result = (signed long)UTEXT_GETNATIVEINDEX(text);
+                lastAcceptingState = state;
+            }
+
+            if (state == STOP_STATE) {
+                // This is the normal exit from the lookup state machine.
+                // We have advanced through the string until it is certain that no
+                // longer match is possible, no matter what characters follow.
+                break;
+            }
+
+            // Advance to the next character.
+            c = UTEXT_NEXT32(text);
+        }
+
+        // If the state machine never reached an accepting state, force the
+        // token to cover one code point so that the tokenizer always makes
+        // progress and never emits a zero-length token.
+        if (result == prev) {
+            UTEXT_SETNATIVEINDEX(text, prev);
+            (void)UTEXT_NEXT32(text);
+            result = (signed long)UTEXT_GETNATIVEINDEX(text);
+        }
+
+        // Let the dictionary code refine the token end when dictionary
+        // characters were seen.
+        if (fDictionaryCharCount > 0) {
+            result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
+        }
+
+emitToken:
+        // The end of the token is known.
+        // Leave the iterator at our result position.
+        UTEXT_SETNATIVEINDEX(text, result);
+
+        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
+        int32_t flags = fStateFlags[lastAcceptingState];
+
+        // Flags of -1 mark a token that must not be reported.
+        if (flags == -1)
+            goto skipToken;
+
+        *outTokenP++ = range;
+        if (outTokenFlags)
+        {
+            *outTokenFlags++ = (unsigned long) flags;
+        }
+
+        // This bit asks the tokenizer to stop right after the current token.
+        if (flags & 0x40000000)
+            goto exitTokenizer;
+
+skipToken:
+        // The next token starts where this one ended.
+        prev = result;
+    }
+
+exitTokenizer:
+    return (int32_t)(outTokenP - outTokenRanges);
+}
+
+#if defined (__GNUC__) && (__GNUC__ >= 4)
+#pragma GCC optimization_level reset
+#endif
+
+/**
+ * Builds the lookup tables used by tokenize() from the rule data in fData:
+ *   - fStartRow   : pointer to the START_STATE row of the forward state table;
+ *   - fLatin1Cat  : raw trie value for each of the code points 0..255;
+ *   - fStateFlags : one int32_t per state of the forward table. For a state
+ *                   whose fAccepting is -1 the values in its rule status
+ *                   entry are combined: positive values are OR-ed together,
+ *                   a zero ends the list, and a negative value replaces the
+ *                   result and ends the list. All other states get 0.
+ * Also sets the break type to UBRK_WORD.
+ *
+ * If fData is NULL (the base class has no rule data, e.g. because its
+ * construction failed) the three members are set to NULL and nothing else
+ * is done, so that the destructor stays safe.
+ */
+void
+RuleBasedTokenizer::init()
+{
+    if (fData == NULL)
+    {
+        fStartRow = NULL;
+        fLatin1Cat = NULL;
+        fStateFlags = NULL;
+        return;
+    }
+
+    const RBBIStateTable *statetable = fData->fForwardTable;
+    setBreakType(UBRK_WORD);
+    fStartRow = (const RBBIStateTableRow *)
+        (statetable->fTableData + (statetable->fRowLen * START_STATE));
+
+    // Cache the trie value of every Latin-1 code point so that tokenize() can
+    // use a plain array lookup for them.
+    const UTrie *trie = &fData->fTrie;
+    fLatin1Cat = new int16_t[256];
+    for (int32_t ch = 0; ch < 256; ++ch)
+    {
+        fLatin1Cat[ch] = _UTRIE_GET_RAW(trie, index, 0, ch);
+    }
+
+    // Precompute the token flags of every state. The index has the same
+    // width as the state count, so it cannot wrap before reaching it.
+    const uint32_t numStates = statetable->fNumStates;
+    fStateFlags = new int32_t[numStates];
+    for (uint32_t s = 0; s < numStates; ++s)
+    {
+        const RBBIStateTableRow *row = (const RBBIStateTableRow *)
+            (statetable->fTableData + (statetable->fRowLen * s));
+        int32_t flags = 0;
+        if (row->fAccepting == -1)
+        {
+            // The status entry is a count followed by that many values.
+            const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
+            const int32_t *valLimit = vals + 1;
+            valLimit += *vals++;
+            while (vals < valLimit)
+            {
+                int32_t val = *vals++;
+                if (val == 0)
+                {
+                    break;
+                }
+                else if (val > 0)
+                {
+                    flags |= val;
+                }
+                else
+                {
+                    // A negative value overrides everything collected so far.
+                    flags = val;
+                    break;
+                }
+            }
+        }
+        fStateFlags[s] = flags;
+    }
+}
+
+/**
+ * Constructs a tokenizer from break rules in source form.
+ *
+ * @param rules     The rule source, passed on to RuleBasedBreakIterator.
+ * @param parseErr  Passed on to RuleBasedBreakIterator.
+ * @param err       ICU error code. The tokenizer's own tables are built only
+ *                  when the base class constructor succeeded; on failure they
+ *                  are left NULL so that destroying the object is safe.
+ */
+RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
+    : RuleBasedBreakIterator(rules, parseErr, err)
+{
+    // Make the members well-defined before init() possibly gets skipped.
+    fStartRow = NULL;
+    fStateFlags = NULL;
+    fLatin1Cat = NULL;
+    if (U_SUCCESS(err)) {
+        init();
+    }
+}
+
+/**
+ * Constructs a tokenizer from compiled rule data.
+ *
+ * @param data    Compiled rule data; it is reinterpreted as an RBBIDataHeader
+ *                and passed on to RuleBasedBreakIterator.
+ * @param status  ICU error code. The tokenizer's own tables are built only
+ *                when the base class constructor succeeded; on failure they
+ *                are left NULL so that destroying the object is safe.
+ */
+RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
+    : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
+{
+    // Make the members well-defined before init() possibly gets skipped.
+    fStartRow = NULL;
+    fStateFlags = NULL;
+    fLatin1Cat = NULL;
+    if (U_SUCCESS(status)) {
+        init();
+    }
+}
+
+/**
+ * Releases the two lookup arrays allocated with new[] in init().
+ */
+RuleBasedTokenizer::~RuleBasedTokenizer()
+{
+    delete [] fLatin1Cat;   // Latin-1 category table
+    delete [] fStateFlags;  // per-state token flags
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */