icuSources/common/rbtok.cpp

   1 /*
   2 ***************************************************************************
   3 *   Copyright (C) 2006 Apple Computer, Inc. All rights reserved.          *
   4 ***************************************************************************
   5
   6 */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_BREAK_ITERATION
  11
  12 #include "rbtok.h"
  13 #include "unicode/ustring.h"
  14 #include "unicode/utext.h"
  15 #include "rbbidata.h"
  16
  17 U_NAMESPACE_BEGIN
  18
  19
  20 #if defined(__GNUC__) && (__GNUC__ >= 4)
  21 #pragma GCC optimization_level 3
  22 #endif
  23
// File-local state numbers for walking the forward state table.
// START_STATE selects the row each token's state machine run begins in;
// a transition to STOP_STATE ends the run for the current token.
static const int16_t START_STATE = 1;     // The state number of the starting state
static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
  26
  27 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
  28 {
  29     RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
  30     RuleBasedTokenRange *outTokenP = outTokenRanges;
  31     int32_t             state;
  32     int16_t             category;
  33
  34     const RBBIStateTableRow  *row;
  35     const RBBIStateTableRow  *const startRow = fStartRow;
  36
  37     int32_t             lastAcceptingState = 0;
  38     UChar32             c = 0;
  39     signed long         prev;
  40     signed long         result;
  41     const char         *const tableData       = fData->fForwardTable->fTableData;
  42     const uint32_t            tableRowLen     = fData->fForwardTable->fRowLen;
  43     UText *text = fText;
  44
  45     #ifdef RBBI_DEBUG
  46         if (fTrace) {
  47             RBBIDebugPuts("Handle Next   pos   char  state category");
  48         }
  49     #endif
  50
  51     fLastStatusIndexValid = FALSE;
  52
  53     // if we're already at the end of the text, return DONE.
  54     prev = (signed long)UTEXT_GETNATIVEINDEX(text);
  55
  56     // loop until we reach the end of the text or transition to state 0
  57     //
  58     const UTrie         *trie = &fData->fTrie;
  59     while (outTokenP < outTokenLimit) {
  60         c               = UTEXT_NEXT32(text);
  61         if (c == U_SENTINEL)
  62         {
  63             goto exitTokenizer;
  64         }
  65         //  Set the initial state for the state machine
  66         state = START_STATE;
  67         row = startRow;
  68
  69         // if we have cached break positions and we're still in the range
  70         // covered by them, just move one step forward in the cache
  71         if (fCachedBreakPositions != NULL) {
  72             if (fPositionInCache < fNumCachedBreakPositions - 1) {
  73                 ++fPositionInCache;
  74                 result = fCachedBreakPositions[fPositionInCache];
  75                 goto emitToken;
  76             }
  77             else {
  78                 reset();
  79             }
  80         }
  81
  82         while (c != U_SENTINEL) {
  83             //
  84             // Get the char category.  An incoming category of 1 or 2 means that
  85             //      we are preset for doing the beginning or end of input, and
  86             //      that we shouldn't get a category from an actual text input character.
  87             //
  88                 // look up the current character's character category, which tells us
  89                 // which column in the state table to look at.
  90                 // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  91                 //        not the size of the character going in, which is a UChar32.
  92                 //
  93                 if (__builtin_expect((c < 0x100), 1))
  94                     category = fLatin1Cat[c];
  95                 else
  96                     UTRIE_GET16(trie, c, category);
  97
  98                 // Check the dictionary bit in the character's category.
  99                 //    Counter is only used by dictionary based iterators (subclasses).
 100                 //    Chars that need to be handled by a dictionary have a flag bit set
 101                 //    in their category values.
 102                 //
 103                 if (__builtin_expect((category & 0x4000) != 0, 0))  {
 104                     fDictionaryCharCount++;
 105                     //  And off the dictionary flag bit.
 106                     category &= ~0x4000;
 107                 }
 108
 109             #ifdef RBBI_DEBUG
 110                 if (fTrace) {
 111                     RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
 112                     if (0x20<=c && c<0x7f) {
 113                         RBBIDebugPrintf("\"%c\"  ", c);
 114                     } else {
 115                         RBBIDebugPrintf("%5x  ", c);
 116                     }
 117                     RBBIDebugPrintf("%3d  %3d\n", state, category);
 118                 }
 119             #endif
 120
 121             // State Transition - move machine to its next state
 122             //
 123             state = row->fNextState[category];
 124             row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
 125
 126             if (row->fAccepting == -1) {
 127                 // Match found, common case.
 128                     result = (signed long)UTEXT_GETNATIVEINDEX(text);
 129                 //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
 130                 //lastStatusRow = row;
 131                 lastAcceptingState = state;
 132             }
 133
 134             if (state == STOP_STATE) {
 135                 // This is the normal exit from the lookup state machine.
 136                 // We have advanced through the string until it is certain that no
 137                 //   longer match is possible, no matter what characters follow.
 138                 break;
 139             }
 140
 141             // Advance to the next character.
 142             // If this is a beginning-of-input loop iteration, don't advance
 143             //    the input position.  The next iteration will be processing the
 144             //    first real input character.
 145                 c = UTEXT_NEXT32(text);
 146         }
 147
 148         if (fDictionaryCharCount > 0) {
 149             result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
 150         }
 151
 152 emitToken:
 153         // The state machine is done.  Check whether it found a match...
 154
 155         // Leave the iterator at our result position.
 156         UTEXT_SETNATIVEINDEX(text, result);
 157
 158         RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
 159         int32_t flags = fStateFlags[lastAcceptingState];
 160
 161         if (flags == -1)
 162             goto skipToken;
 163
 164         *outTokenP++ = range;
 165         if (outTokenFlags)
 166         {
 167             *outTokenFlags++ = (unsigned long) flags;
 168         }
 169
 170         if (flags & 0x40000000)
 171             goto exitTokenizer;
 172
 173 skipToken:
 174         prev = result;
 175     }
 176
 177 exitTokenizer:
 178     return (outTokenP - outTokenRanges);
 179 }
 180
 181 #if defined (__GNUC__) && (__GNUC__ >= 4)
 182 #pragma GCC optimization_level reset
 183 #endif
 184
 185 void
 186 RuleBasedTokenizer::init()
 187 {
 188     const RBBIStateTable *statetable = fData->fForwardTable;
 189     setBreakType(UBRK_WORD);
 190     fStartRow = (const RBBIStateTableRow *)
 191         (statetable->fTableData + (statetable->fRowLen * START_STATE));
 192     UChar i;
 193     const UTrie         *trie = &fData->fTrie;
 194     int16_t category;
 195     fLatin1Cat = new int16_t[256];
 196     for (i = 0; i < 256; ++i)
 197     {
 198         //UTRIE_GET16(trie, i, category);
 199         //fLatin1Cat[i] = category;
 200         fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
 201     }
 202     fStateFlags = new int32_t[statetable->fNumStates];
 203     for (i = 0; i < statetable->fNumStates; ++i)
 204     {
 205         const RBBIStateTableRow *row = (const RBBIStateTableRow *)
 206             (statetable->fTableData + (statetable->fRowLen * i));
 207         int32_t flags = 0;
 208         if (row->fAccepting == -1)
 209         {
 210             const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
 211             const int32_t *valLimit = vals + 1;
 212             valLimit += *vals++;
 213             while (vals < valLimit)
 214             {
 215                 int32_t val = *vals++;
 216                 if (val == 0)
 217                 {
 218                     break;
 219                 }
 220                 else if (val > 0)
 221                 {
 222                     flags |= val;
 223                 }
 224                 else
 225                 {
 226                     flags = val;
 227                     break;
 228                 }
 229             }
 230         }
 231         fStateFlags[i] = flags;
 232     }
 233 }
 234
 235 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
 236     : RuleBasedBreakIterator(rules, parseErr, err)
 237 {
 238     init();
 239 }
 240
 241 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
 242     : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
 243 {
 244     init();
 245 }
 246
 247 RuleBasedTokenizer::~RuleBasedTokenizer() {
 248     delete [] fStateFlags;
 249     delete [] fLatin1Cat;
 250 }
 251
 252 U_NAMESPACE_END
 253
 254 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */