icuSources/common/rbtok.cpp

   1 /*
   2 ***************************************************************************
   3 * Copyright (C) 2006-2008,2017 Apple Inc. All Rights Reserved.            *
   4 ***************************************************************************
   5 */
   6
   7 #include "unicode/utypes.h"
   8
   9 #if !UCONFIG_NO_BREAK_ITERATION
  10
  11 #include "rbtok.h"
  12 #include "unicode/ustring.h"
  13 #include "unicode/utext.h"
  14 #include "rbbidata.h"
  15 #include "rbbirb.h"
  16 #include "uassert.h"
  17
#ifdef RBBI_DEBUG
// Debug-only trace switch consulted by the RBBI_DEBUG blocks in this file.
// The equivalent flag is now a file-static in rbbi.cpp, where it is set
// dynamically. It is duplicated here for now so that this file builds;
// force it to TRUE if trace output is desired.
static UBool fTrace = FALSE;
#endif
  23
  24 U_NAMESPACE_BEGIN
  25
  26
  27 static const int16_t START_STATE = 1;     // The state number of the starting state
  28 static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
  29
  30 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
  31 {
  32     RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
  33     RuleBasedTokenRange *outTokenP = outTokenRanges;
  34     int32_t             state;
  35     uint16_t            category = 0;
  36
  37     const RBBIStateTableRow  *row;
  38     const RBBIStateTableRow  *const startRow = fStartRow;
  39
  40     int32_t             lastAcceptingState = 0;
  41     UChar32             c = 0;
  42     signed long         prev;
  43     signed long         result;
  44     const char         *const tableData       = fData->fForwardTable->fTableData;
  45     const uint32_t            tableRowLen     = fData->fForwardTable->fRowLen;
  46     UText *text = fText;
  47
  48     #ifdef RBBI_DEBUG
  49         if (fTrace) {
  50             RBBIDebugPuts("Handle Next   pos   char  state category");
  51         }
  52     #endif
  53
  54     fLastStatusIndexValid = FALSE;
  55
  56     // if we're already at the end of the text, return DONE.
  57     prev = (signed long)UTEXT_GETNATIVEINDEX(text);
  58
  59     // loop until we reach the end of the text or transition to state 0
  60     //
  61     const UTrie *trie = &fData->fTrie;
  62     while (outTokenP < outTokenLimit) {
  63         // LookAheadResults lookAheadMatches; // added in RBBI, #12081/r38387
  64         result = prev; // fallback initialization, prevent uninitialized use
  65         c = UTEXT_NEXT32(text);
  66         if (c == U_SENTINEL)
  67         {
  68             goto exitTokenizer;
  69         }
  70         //  Set the initial state for the state machine
  71         state = START_STATE;
  72         row = startRow;
  73
  74         // if we have cached break positions and we're still in the range
  75         // covered by them, just move one step forward in the cache
  76         if (fCachedBreakPositions != NULL) {
  77             if (fPositionInCache < fNumCachedBreakPositions - 1) {
  78                 ++fPositionInCache;
  79                 result = fCachedBreakPositions[fPositionInCache];
  80                 goto emitToken;
  81             }
  82             else {
  83                 reset();
  84             }
  85         }
  86
  87         while (c != U_SENTINEL) {
  88             //
  89             // Get the char category.  An incoming category of 1 or 2 means that
  90             //      we are preset for doing the beginning or end of input, and
  91             //      that we shouldn't get a category from an actual text input character.
  92             //
  93                 // look up the current character's character category, which tells us
  94                 // which column in the state table to look at.
  95                 // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  96                 //        not the size of the character going in, which is a UChar32.
  97                 //
  98                 if (c < 0x100)
  99                     category = fLatin1Cat[c];
 100                 else
 101                     UTRIE_GET16(trie, c, category);
 102
 103                 // Check the dictionary bit in the character's category.
 104                 //    Counter is only used by dictionary based iterators (subclasses).
 105                 //    Chars that need to be handled by a dictionary have a flag bit set
 106                 //    in their category values.
 107                 //
 108                 if ((category & 0x4000) != 0)  {
 109                     fDictionaryCharCount++;
 110                     //  And off the dictionary flag bit.
 111                     category &= ~0x4000;
 112                 }
 113
 114             #ifdef RBBI_DEBUG
 115                 if (fTrace) {
 116                     RBBIDebugPrintf("             %4lld   ", utext_getNativeIndex(fText));
 117                     if (0x20<=c && c<0x7f) {
 118                         RBBIDebugPrintf("\"%c\"  ", c);
 119                     } else {
 120                         RBBIDebugPrintf("%5x  ", c);
 121                     }
 122                     RBBIDebugPrintf("%3d  %3d\n", state, category);
 123                 }
 124             #endif
 125
 126             // State Transition - move machine to its next state
 127             //
 128
 129             // Note: fNextState is defined as uint16_t[2], but we are casting
 130             // a generated RBBI table to RBBIStateTableRow and some tables
 131             // actually have more than 2 categories.
 132             U_ASSERT(category<fData->fHeader->fCatCount);
 133             state = row->fNextState[category];
 134             row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
 135
 136             if (row->fAccepting == -1) {
 137                 // Match found, common case.
 138                     result = (signed long)UTEXT_GETNATIVEINDEX(text);
 139                 //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
 140                 //lastStatusRow = row;
 141                 lastAcceptingState = state;
 142             }
 143
 144             // rbbi has added code here to check lookAheadMatches and
 145             // set lookAheadMatches, per open-source ICU #12081/r38387
 146
 147             if (state == STOP_STATE) {
 148                 // This is the normal exit from the lookup state machine.
 149                 // We have advanced through the string until it is certain that no
 150                 //   longer match is possible, no matter what characters follow.
 151                 break;
 152             }
 153
 154             // Advance to the next character.
 155             // If this is a beginning-of-input loop iteration, don't advance
 156             //    the input position.  The next iteration will be processing the
 157             //    first real input character.
 158                 c = UTEXT_NEXT32(text);
 159         }
 160
 161         if (fDictionaryCharCount > 0) {
 162             result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
 163         }
 164
 165 emitToken:
 166         // The state machine is done.  Check whether it found a match...
 167
 168         // If the iterator failed to advance in the match engine, force it ahead by one.
 169         //   (This really indicates a defect in the break rules.  They should always match
 170         //    at least one character.). Added in open-source ICU r13469
 171         UBool setFlagsZero = FALSE;
 172         if (result == prev) {
 173             UTEXT_SETNATIVEINDEX(text, prev);
 174             UTEXT_NEXT32(text);
 175             result = (int32_t)UTEXT_GETNATIVEINDEX(text);
 176             setFlagsZero = TRUE;
 177         }
 178
 179         // Leave the iterator at our result position.
 180         UTEXT_SETNATIVEINDEX(text, result);
 181
 182         RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
 183         int32_t flags = (!setFlagsZero)? fStateFlags[lastAcceptingState]: 0;
 184
 185         if (flags == -1) {
 186             goto skipToken;
 187         }
 188
 189     #ifdef RBBI_DEBUG
 190         if (fTrace) {
 191             RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range.location, range.length, flags);
 192         }
 193     #endif
 194         *outTokenP++ = range;
 195         if (outTokenFlags)
 196         {
 197             *outTokenFlags++ = (unsigned long) flags;
 198         }
 199
 200         if (flags & 0x40000000) {
 201             goto exitTokenizer;
 202         }
 203
 204 skipToken:
 205         prev = result;
 206     }
 207
 208 exitTokenizer:
 209     return (outTokenP - outTokenRanges);
 210 }
 211
 212 void
 213 RuleBasedTokenizer::init()
 214 {
 215     const RBBIStateTable *statetable = fData->fForwardTable;
 216     setBreakType(UBRK_WORD);
 217     fStartRow = (const RBBIStateTableRow *)
 218         (statetable->fTableData + (statetable->fRowLen * START_STATE));
 219     UChar i;
 220     const UTrie         *trie = &fData->fTrie;
 221     //int16_t category;
 222     fLatin1Cat = new int16_t[256];
 223     for (i = 0; i < 256; ++i)
 224     {
 225         //UTRIE_GET16(trie, i, category);
 226         //fLatin1Cat[i] = category;
 227         fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
 228     }
 229     fStateFlags = new int32_t[statetable->fNumStates];
 230     for (i = 0; i < statetable->fNumStates; ++i)
 231     {
 232         const RBBIStateTableRow *row = (const RBBIStateTableRow *)
 233             (statetable->fTableData + (statetable->fRowLen * i));
 234         int32_t flags = 0;
 235         if (row->fAccepting == -1 && row->fTagIdx != 0)
 236         {
 237             const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
 238             const int32_t *valLimit = vals + 1;
 239             valLimit += *vals++;
 240             while (vals < valLimit)
 241             {
 242                 int32_t val = *vals++;
 243                 if (val == 0)
 244                 {
 245                     break;
 246                 }
 247                 else if (val > 0)
 248                 {
 249                     flags |= val;
 250                 }
 251                 else
 252                 {
 253                     flags = val;
 254                     break;
 255                 }
 256             }
 257         }
 258         fStateFlags[i] = flags;
 259     }
 260 }
 261
 262 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
 263     : RuleBasedBreakIterator(rules, parseErr, err)
 264 {
 265     if (U_SUCCESS(err)) {
 266         init();
 267     }
 268 }
 269
 270 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
 271     : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
 272 {
 273     if (U_SUCCESS(status)) {
 274         init();
 275     }
 276 }
 277
 278 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
 279     : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
 280 {
 281     if (U_SUCCESS(status)) {
 282         init();
 283     }
 284 }
 285
 286 RuleBasedTokenizer::~RuleBasedTokenizer() {
 287     delete [] fStateFlags;
 288     delete [] fLatin1Cat;
 289 }
 290
 291 U_NAMESPACE_END
 292
 293 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */