icuSources/common/rbtok.cpp

   1 /*
   2 ***************************************************************************
   3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.                 *
   4 ***************************************************************************
   5 */
   6
   7 #include "unicode/utypes.h"
   8
   9 #if !UCONFIG_NO_BREAK_ITERATION
  10
  11 #include "rbtok.h"
  12 #include "unicode/ustring.h"
  13 #include "unicode/utext.h"
  14 #include "rbbidata.h"
  15
  16 U_NAMESPACE_BEGIN
  17
  18
  19 #if defined(__GNUC__) && (__GNUC__ >= 4)
  20 #pragma GCC optimization_level 3
  21 #endif
  22
  23 static const int16_t START_STATE = 1;     // The state number of the starting state
  24 static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
  25
  26 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
  27 {
  28     RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
  29     RuleBasedTokenRange *outTokenP = outTokenRanges;
  30     int32_t             state;
  31     int16_t             category;
  32
  33     const RBBIStateTableRow  *row;
  34     const RBBIStateTableRow  *const startRow = fStartRow;
  35
  36     int32_t             lastAcceptingState = 0;
  37     UChar32             c = 0;
  38     signed long         prev;
  39     signed long         result;
  40     const char         *const tableData       = fData->fForwardTable->fTableData;
  41     const uint32_t            tableRowLen     = fData->fForwardTable->fRowLen;
  42     UText *text = fText;
  43
  44     #ifdef RBBI_DEBUG
  45         if (fTrace) {
  46             RBBIDebugPuts("Handle Next   pos   char  state category");
  47         }
  48     #endif
  49
  50     fLastStatusIndexValid = FALSE;
  51
  52     // if we're already at the end of the text, return DONE.
  53     prev = (signed long)UTEXT_GETNATIVEINDEX(text);
  54
  55     // loop until we reach the end of the text or transition to state 0
  56     //
  57     const UTrie         *trie = &fData->fTrie;
  58     while (outTokenP < outTokenLimit) {
  59         c               = UTEXT_NEXT32(text);
  60         if (c == U_SENTINEL)
  61         {
  62             goto exitTokenizer;
  63         }
  64         //  Set the initial state for the state machine
  65         state = START_STATE;
  66         row = startRow;
  67
  68         // if we have cached break positions and we're still in the range
  69         // covered by them, just move one step forward in the cache
  70         if (fCachedBreakPositions != NULL) {
  71             if (fPositionInCache < fNumCachedBreakPositions - 1) {
  72                 ++fPositionInCache;
  73                 result = fCachedBreakPositions[fPositionInCache];
  74                 goto emitToken;
  75             }
  76             else {
  77                 reset();
  78             }
  79         }
  80
  81         while (c != U_SENTINEL) {
  82             //
  83             // Get the char category.  An incoming category of 1 or 2 means that
  84             //      we are preset for doing the beginning or end of input, and
  85             //      that we shouldn't get a category from an actual text input character.
  86             //
  87                 // look up the current character's character category, which tells us
  88                 // which column in the state table to look at.
  89                 // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  90                 //        not the size of the character going in, which is a UChar32.
  91                 //
  92                 if (c < 0x100)
  93                     category = fLatin1Cat[c];
  94                 else
  95                     UTRIE_GET16(trie, c, category);
  96
  97                 // Check the dictionary bit in the character's category.
  98                 //    Counter is only used by dictionary based iterators (subclasses).
  99                 //    Chars that need to be handled by a dictionary have a flag bit set
 100                 //    in their category values.
 101                 //
 102                 if ((category & 0x4000) != 0)  {
 103                     fDictionaryCharCount++;
 104                     //  And off the dictionary flag bit.
 105                     category &= ~0x4000;
 106                 }
 107
 108             #ifdef RBBI_DEBUG
 109                 if (fTrace) {
 110                     RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
 111                     if (0x20<=c && c<0x7f) {
 112                         RBBIDebugPrintf("\"%c\"  ", c);
 113                     } else {
 114                         RBBIDebugPrintf("%5x  ", c);
 115                     }
 116                     RBBIDebugPrintf("%3d  %3d\n", state, category);
 117                 }
 118             #endif
 119
 120             // State Transition - move machine to its next state
 121             //
 122             state = row->fNextState[category];
 123             row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
 124
 125             if (row->fAccepting == -1) {
 126                 // Match found, common case.
 127                     result = (signed long)UTEXT_GETNATIVEINDEX(text);
 128                 //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
 129                 //lastStatusRow = row;
 130                 lastAcceptingState = state;
 131             }
 132
 133             if (state == STOP_STATE) {
 134                 // This is the normal exit from the lookup state machine.
 135                 // We have advanced through the string until it is certain that no
 136                 //   longer match is possible, no matter what characters follow.
 137                 break;
 138             }
 139
 140             // Advance to the next character.
 141             // If this is a beginning-of-input loop iteration, don't advance
 142             //    the input position.  The next iteration will be processing the
 143             //    first real input character.
 144                 c = UTEXT_NEXT32(text);
 145         }
 146
 147         if (fDictionaryCharCount > 0) {
 148             result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
 149         }
 150
 151 emitToken:
 152         // The state machine is done.  Check whether it found a match...
 153
 154         // Leave the iterator at our result position.
 155         UTEXT_SETNATIVEINDEX(text, result);
 156
 157         RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
 158         int32_t flags = fStateFlags[lastAcceptingState];
 159
 160         if (flags == -1)
 161             goto skipToken;
 162
 163         *outTokenP++ = range;
 164         if (outTokenFlags)
 165         {
 166             *outTokenFlags++ = (unsigned long) flags;
 167         }
 168
 169         if (flags & 0x40000000)
 170             goto exitTokenizer;
 171
 172 skipToken:
 173         prev = result;
 174     }
 175
 176 exitTokenizer:
 177     return (outTokenP - outTokenRanges);
 178 }
 179
 180 #if defined (__GNUC__) && (__GNUC__ >= 4)
 181 #pragma GCC optimization_level reset
 182 #endif
 183
 184 void
 185 RuleBasedTokenizer::init()
 186 {
 187     const RBBIStateTable *statetable = fData->fForwardTable;
 188     setBreakType(UBRK_WORD);
 189     fStartRow = (const RBBIStateTableRow *)
 190         (statetable->fTableData + (statetable->fRowLen * START_STATE));
 191     UChar i;
 192     const UTrie         *trie = &fData->fTrie;
 193     int16_t category;
 194     fLatin1Cat = new int16_t[256];
 195     for (i = 0; i < 256; ++i)
 196     {
 197         //UTRIE_GET16(trie, i, category);
 198         //fLatin1Cat[i] = category;
 199         fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
 200     }
 201     fStateFlags = new int32_t[statetable->fNumStates];
 202     for (i = 0; i < statetable->fNumStates; ++i)
 203     {
 204         const RBBIStateTableRow *row = (const RBBIStateTableRow *)
 205             (statetable->fTableData + (statetable->fRowLen * i));
 206         int32_t flags = 0;
 207         if (row->fAccepting == -1)
 208         {
 209             const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
 210             const int32_t *valLimit = vals + 1;
 211             valLimit += *vals++;
 212             while (vals < valLimit)
 213             {
 214                 int32_t val = *vals++;
 215                 if (val == 0)
 216                 {
 217                     break;
 218                 }
 219                 else if (val > 0)
 220                 {
 221                     flags |= val;
 222                 }
 223                 else
 224                 {
 225                     flags = val;
 226                     break;
 227                 }
 228             }
 229         }
 230         fStateFlags[i] = flags;
 231     }
 232 }
 233
 234 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
 235     : RuleBasedBreakIterator(rules, parseErr, err)
 236 {
 237     if (U_SUCCESS(err)) {
 238         init();
 239     }
 240 }
 241
 242 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
 243     : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
 244 {
 245     if (U_SUCCESS(status)) {
 246         init();
 247     }
 248 }
 249
 250 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
 251     : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
 252 {
 253     if (U_SUCCESS(status)) {
 254         init();
 255     }
 256 }
 257
 258 RuleBasedTokenizer::~RuleBasedTokenizer() {
 259     delete [] fStateFlags;
 260     delete [] fLatin1Cat;
 261 }
 262
 263 U_NAMESPACE_END
 264
 265 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */