ICU-8.11.tar.gz

[apple/icu.git] / icuSources / common / rbtok.cpp
diff --git a/icuSources/common/rbtok.cpp b/icuSources/common/rbtok.cpp

new file mode 100644 (file)

index 0000000..311afd2
--- /dev/null
+++ b/icuSources/common/rbtok.cpp
@@ -0,0 +1,254 @@
+/*
+***************************************************************************
+*   Copyright (C) 2006 Apple Computer, Inc. All rights reserved.          *
+***************************************************************************
+
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "rbtok.h"
+#include "unicode/ustring.h"
+#include "unicode/utext.h"
+#include "rbbidata.h"
+
+U_NAMESPACE_BEGIN
+
+
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#pragma GCC optimization_level 3
+#endif
+
+static const int16_t START_STATE = 1;     // State in which every token's state-machine run begins (also selects the row cached in fStartRow)
+static const int16_t STOP_STATE  = 0;     // Next-state value meaning "stop": no longer match is possible, so the run ends
+
+int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
+{
+    RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
+    RuleBasedTokenRange *outTokenP = outTokenRanges;
+    int32_t             state;
+    int16_t             category;
+    
+    const RBBIStateTableRow  *row;
+    const RBBIStateTableRow  *const startRow = fStartRow;
+
+    int32_t             lastAcceptingState = 0;
+    UChar32             c = 0;
+    signed long         prev;
+    signed long         result;
+    const char         *const tableData       = fData->fForwardTable->fTableData;
+    const uint32_t            tableRowLen     = fData->fForwardTable->fRowLen;
+    UText *text = fText;
+
+    #ifdef RBBI_DEBUG
+        if (fTrace) {
+            RBBIDebugPuts("Handle Next   pos   char  state category");
+        }
+    #endif
+
+    fLastStatusIndexValid = FALSE;
+
+    // if we're already at the end of the text, return DONE.
+    prev = (signed long)UTEXT_GETNATIVEINDEX(text);
+    
+    // loop until we reach the end of the text or transition to state 0
+    //
+    const UTrie         *trie = &fData->fTrie;
+    while (outTokenP < outTokenLimit) {
+        c               = UTEXT_NEXT32(text);
+        if (c == U_SENTINEL)
+        {
+            goto exitTokenizer;
+        }
+        //  Set the initial state for the state machine
+        state = START_STATE;
+        row = startRow;
+            
+        // if we have cached break positions and we're still in the range
+        // covered by them, just move one step forward in the cache
+        if (fCachedBreakPositions != NULL) {
+            if (fPositionInCache < fNumCachedBreakPositions - 1) {
+                ++fPositionInCache;
+                result = fCachedBreakPositions[fPositionInCache];
+                goto emitToken;
+            }
+            else {
+                reset();
+            }
+        }
+
+        while (c != U_SENTINEL) {
+            //
+            // Get the char category.  An incoming category of 1 or 2 means that
+            //      we are preset for doing the beginning or end of input, and
+            //      that we shouldn't get a category from an actual text input character.
+            //
+                // look up the current character's character category, which tells us
+                // which column in the state table to look at.
+                // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
+                //        not the size of the character going in, which is a UChar32.
+                //
+                if (__builtin_expect((c < 0x100), 1))
+                    category = fLatin1Cat[c];
+                else
+                    UTRIE_GET16(trie, c, category);
+    
+                // Check the dictionary bit in the character's category.
+                //    Counter is only used by dictionary based iterators (subclasses).
+                //    Chars that need to be handled by a dictionary have a flag bit set
+                //    in their category values.
+                //
+                if (__builtin_expect((category & 0x4000) != 0, 0))  {
+                    fDictionaryCharCount++;
+                    //  And off the dictionary flag bit.
+                    category &= ~0x4000;
+                }
+    
+            #ifdef RBBI_DEBUG
+                if (fTrace) {
+                    RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
+                    if (0x20<=c && c<0x7f) {
+                        RBBIDebugPrintf("\"%c\"  ", c);
+                    } else {
+                        RBBIDebugPrintf("%5x  ", c);
+                    }
+                    RBBIDebugPrintf("%3d  %3d\n", state, category);
+                }
+            #endif
+    
+            // State Transition - move machine to its next state
+            //
+            state = row->fNextState[category];
+            row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
+    
+            if (row->fAccepting == -1) {
+                // Match found, common case.
+                    result = (signed long)UTEXT_GETNATIVEINDEX(text);
+                //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
+                //lastStatusRow = row;
+                lastAcceptingState = state;
+            }
+    
+            if (state == STOP_STATE) {
+                // This is the normal exit from the lookup state machine.
+                // We have advanced through the string until it is certain that no
+                //   longer match is possible, no matter what characters follow.
+                break;
+            }
+            
+            // Advance to the next character.  
+            // If this is a beginning-of-input loop iteration, don't advance
+            //    the input position.  The next iteration will be processing the
+            //    first real input character.
+                c = UTEXT_NEXT32(text);
+        }
+
+        if (fDictionaryCharCount > 0) {
+            result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
+        }
+
+emitToken:
+        // The state machine is done.  Check whether it found a match...
+    
+        // Leave the iterator at our result position.
+        UTEXT_SETNATIVEINDEX(text, result);
+
+        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
+        int32_t flags = fStateFlags[lastAcceptingState];
+
+        if (flags == -1)
+            goto skipToken;
+
+        *outTokenP++ = range;
+        if (outTokenFlags)
+        {
+            *outTokenFlags++ = (unsigned long) flags;
+        }
+
+        if (flags & 0x40000000)
+            goto exitTokenizer;
+
+skipToken:
+        prev = result;
+    }
+    
+exitTokenizer:
+    return (outTokenP - outTokenRanges);
+}
+
+#if defined (__GNUC__) && (__GNUC__ >= 4)
+#pragma GCC optimization_level reset
+#endif
+
+void
+RuleBasedTokenizer::init()
+{
+    const RBBIStateTable *statetable = fData->fForwardTable;
+    setBreakType(UBRK_WORD);
+    fStartRow = (const RBBIStateTableRow *)
+        (statetable->fTableData + (statetable->fRowLen * START_STATE));
+    UChar i;
+    const UTrie         *trie = &fData->fTrie;
+    int16_t category;
+    fLatin1Cat = new int16_t[256];
+    for (i = 0; i < 256; ++i)
+    {
+        //UTRIE_GET16(trie, i, category);
+        //fLatin1Cat[i] = category;
+        fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
+    }
+    fStateFlags = new int32_t[statetable->fNumStates];
+    for (i = 0; i < statetable->fNumStates; ++i)
+    {
+        const RBBIStateTableRow *row = (const RBBIStateTableRow *)
+            (statetable->fTableData + (statetable->fRowLen * i));
+        int32_t flags = 0;
+        if (row->fAccepting == -1)
+        {
+            const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
+            const int32_t *valLimit = vals + 1;
+            valLimit += *vals++;
+            while (vals < valLimit)
+            {
+                int32_t val = *vals++;
+                if (val == 0)
+                {
+                    break;
+                }
+                else if (val > 0)
+                {
+                    flags |= val;
+                }
+                else
+                {
+                    flags = val;
+                    break;
+                }
+            }
+        }
+        fStateFlags[i] = flags;
+    }
+}
+
+/**
+ * Construct a tokenizer from rule source text.
+ *
+ * @param rules     Rule source, passed through to RuleBasedBreakIterator.
+ * @param parseErr  Passed through to RuleBasedBreakIterator.
+ * @param err       Error code, passed through to RuleBasedBreakIterator.  The
+ *                  tokenizer's lookup tables are only built when it indicates
+ *                  success after the base class has been constructed.
+ */
+RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
+    : RuleBasedBreakIterator(rules, parseErr, err)
+{
+    // Give the destructor well-defined pointers even if init() is skipped.
+    fStartRow = NULL;
+    fLatin1Cat = NULL;
+    fStateFlags = NULL;
+
+    // init() reads the compiled rule data; don't touch it after a failure.
+    if (U_SUCCESS(err)) {
+        init();
+    }
+}
+
+/**
+ * Construct a tokenizer from precompiled rule data.
+ *
+ * @param data    Compiled rule data; handed to RuleBasedBreakIterator as an
+ *                RBBIDataHeader pointer.
+ * @param status  Error code, passed through to RuleBasedBreakIterator.  The
+ *                tokenizer's lookup tables are only built when it indicates
+ *                success after the base class has been constructed.
+ */
+RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
+    : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
+{
+    // Give the destructor well-defined pointers even if init() is skipped.
+    fStartRow = NULL;
+    fLatin1Cat = NULL;
+    fStateFlags = NULL;
+
+    // init() reads the compiled rule data; don't touch it after a failure.
+    if (U_SUCCESS(status)) {
+        init();
+    }
+}
+
+/**
+ * Frees the two lookup arrays that init() allocates with new[].
+ */
+RuleBasedTokenizer::~RuleBasedTokenizer()
+{
+    delete [] fLatin1Cat;
+    delete [] fStateFlags;
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */