2 ***************************************************************************
3 * Copyright (C) 2006-2008,2017 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION
12 #include "unicode/ustring.h"
13 #include "unicode/utext.h"
19 // The following is now static in rbbi.cpp and gets set dynamically.
20 // For now duplicate here to build, and force to TRUE if desired.
// File-local trace flag, initialised to FALSE.
// NOTE(review): presumably consulted by the RBBIDebug* macros used in this
// file; set the initialiser to TRUE to force tracing -- confirm.
21 static UBool fTrace
= FALSE
;
// Distinguished state numbers of the state table.
// START_STATE selects the row that init() stores in fStartRow;
// STOP_STATE is the transition value tokenize() compares against to detect
// that the state machine has finished a match attempt.
27 static const int16_t START_STATE
= 1; // The state number of the starting state
28 static const int16_t STOP_STATE
= 0; // The state-transition value indicating "stop"
// Runs the forward state table over the text, beginning at the text's
// current native index, and emits one token range per outer-loop iteration.
//
// @param maxTokens       Capacity of outTokenRanges; the outer loop runs only
//                        while fewer than maxTokens ranges have been written.
// @param outTokenRanges  Output array; each emitted entry is
//                        {start index, length} = {prev, result - prev}.
// @param outTokenFlags   Output array for the per-token flags word, which is
//                        fStateFlags[lastAcceptingState], or 0 when
//                        setFlagsZero is set.
//                        NOTE(review): whether a NULL pointer is tolerated
//                        here must be confirmed against the full source.
// @return                outTokenP - outTokenRanges, i.e. the distance the
//                        output cursor has advanced from the start of
//                        outTokenRanges.
30 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
)
// One-past-the-end sentinel for the output array, and the write cursor.
32 RuleBasedTokenRange
*outTokenLimit
= outTokenRanges
+ maxTokens
;
33 RuleBasedTokenRange
*outTokenP
= outTokenRanges
;
35 uint16_t category
= 0;
37 const RBBIStateTableRow
*row
;
38 const RBBIStateTableRow
*const startRow
= fStartRow
;
40 int32_t lastAcceptingState
= 0;
// Cache the forward table's base address and row stride so that rows can be
// located with plain pointer arithmetic inside the inner loop.
44 const char *const tableData
= fData
->fForwardTable
->fTableData
;
45 const uint32_t tableRowLen
= fData
->fForwardTable
->fRowLen
;
50 RBBIDebugPuts("Handle Next pos char state category");
54 fLastStatusIndexValid
= FALSE
;
56 // if we're already at the end of the text, return DONE.
// prev holds the start position of the token about to be matched.
57 prev
= (signed long)UTEXT_GETNATIVEINDEX(text
);
59 // loop until we reach the end of the text or transition to state 0
61 const UTrie
*trie
= &fData
->fTrie
;
// Outer loop: one pass per output slot, bounded by the output capacity.
62 while (outTokenP
< outTokenLimit
) {
63 // LookAheadResults lookAheadMatches; // added in RBBI, #12081/r38387
64 c
= UTEXT_NEXT32(text
);
69 // Set the initial state for the state machine
73 // if we have cached break positions and we're still in the range
74 // covered by them, just move one step forward in the cache
75 if (fCachedBreakPositions
!= NULL
) {
76 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
78 result
= fCachedBreakPositions
[fPositionInCache
];
// Inner loop: feed characters to the state machine until the input is
// exhausted (U_SENTINEL) or the machine stops.
86 while (c
!= U_SENTINEL
) {
88 // Get the char category. An incoming category of 1 or 2 means that
89 // we are preset for doing the beginning or end of input, and
90 // that we shouldn't get a category from an actual text input character.
92 // look up the current character's character category, which tells us
93 // which column in the state table to look at.
94 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
95 // not the size of the character going in, which is a UChar32.
// Two category sources: the fLatin1Cat array built by init(), and the trie.
// NOTE(review): presumably fLatin1Cat is a fast path for c < 0x100 and the
// trie lookup is the fallback -- confirm the selecting condition.
98 category
= fLatin1Cat
[c
];
100 UTRIE_GET16(trie
, c
, category
);
102 // Check the dictionary bit in the character's category.
103 // Counter is only used by dictionary based iterators (subclasses).
104 // Chars that need to be handled by a dictionary have a flag bit set
105 // in their category values.
107 if ((category
& 0x4000) != 0) {
108 fDictionaryCharCount
++;
109 // And off the dictionary flag bit.
// Debug trace of position, character (printable ASCII shown literally),
// state and category.
115 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText
));
116 if (0x20<=c
&& c
<0x7f) {
117 RBBIDebugPrintf("\"%c\" ", c
);
119 RBBIDebugPrintf("%5x ", c
);
121 RBBIDebugPrintf("%3d %3d\n", state
, category
);
125 // State Transition - move machine to its next state
128 // Note: fNextState is defined as uint16_t[2], but we are casting
129 // a generated RBBI table to RBBIStateTableRow and some tables
130 // actually have more than 2 categories.
131 U_ASSERT(category
<fData
->fHeader
->fCatCount
);
// Next state is the current row's column for this category; the new row is
// then found at byte offset tableRowLen * state from tableData.
132 state
= row
->fNextState
[category
];
133 row
= (const RBBIStateTableRow
*) (tableData
+ tableRowLen
* state
);
135 if (row
->fAccepting
== -1) {
136 // Match found, common case.
// Record the position after the character just consumed and the accepting
// state, which later selects the token's flags from fStateFlags.
137 result
= (signed long)UTEXT_GETNATIVEINDEX(text
);
138 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
139 //lastStatusRow = row;
140 lastAcceptingState
= state
;
143 // rbbi has added code here to check lookAheadMatches and
144 // set lookAheadMatches, per open-source ICU #12081/r38387
146 if (state
== STOP_STATE
) {
147 // This is the normal exit from the lookup state machine.
148 // We have advanced through the string until it is certain that no
149 // longer match is possible, no matter what characters follow.
153 // Advance to the next character.
154 // If this is a beginning-of-input loop iteration, don't advance
155 // the input position. The next iteration will be processing the
156 // first real input character.
157 c
= UTEXT_NEXT32(text
);
// If any dictionary-flagged characters were counted, pass the candidate
// boundary through checkDictionary() and use its return value instead.
160 if (fDictionaryCharCount
> 0) {
161 result
= (signed long) checkDictionary(prev
, (int32_t) result
, FALSE
);
165 // The state machine is done. Check whether it found a match...
167 // If the iterator failed to advance in the match engine, force it ahead by one.
168 // (This really indicates a defect in the break rules. They should always match
169 // at least one character.). Added in open-source ICU r13469
170 UBool setFlagsZero
= FALSE
;
171 if (result
== prev
) {
172 UTEXT_SETNATIVEINDEX(text
, prev
);
174 result
= (int32_t)UTEXT_GETNATIVEINDEX(text
);
178 // Leave the iterator at our result position.
179 UTEXT_SETNATIVEINDEX(text
, result
);
// Build the output record: token start and length, plus its flags.
181 RuleBasedTokenRange range
= {(signed long)prev
, (signed long) (result
-prev
)};
182 int32_t flags
= (!setFlagsZero
)? fStateFlags
[lastAcceptingState
]: 0;
190 RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range
.location
, range
.length
, flags
);
193 *outTokenP
++ = range
;
196 *outTokenFlags
++ = (unsigned long) flags
;
// Bit 0x40000000 of the flags word receives special handling.
// NOTE(review): presumably marks a token that should not be reported to the
// caller -- confirm against the full source and the rule files.
199 if (flags
& 0x40000000) {
208 return (outTokenP
- outTokenRanges
);
// Prepares the per-instance lookup data read by tokenize():
//   - sets the break type to UBRK_WORD;
//   - fStartRow:   address of the START_STATE row in the forward state table;
//   - fLatin1Cat:  new int16_t[256], filled from the trie for i = 0..255;
//   - fStateFlags: new int32_t[fNumStates], one flags word per state.
// Both arrays are released with delete[] in the destructor.
212 RuleBasedTokenizer::init()
214 const RBBIStateTable
*statetable
= fData
->fForwardTable
;
215 setBreakType(UBRK_WORD
);
// Row address = table base + row stride * state number.
216 fStartRow
= (const RBBIStateTableRow
*)
217 (statetable
->fTableData
+ (statetable
->fRowLen
* START_STATE
));
219 const UTrie
*trie
= &fData
->fTrie
;
// Build the 256-entry table that tokenize() indexes by character value.
221 fLatin1Cat
= new int16_t[256];
222 for (i
= 0; i
< 256; ++i
)
224 //UTRIE_GET16(trie, i, category);
225 //fLatin1Cat[i] = category;
// Raw trie read used in place of the commented-out UTRIE_GET16 form above.
226 fLatin1Cat
[i
] = _UTRIE_GET_RAW(trie
, index
, 0, i
);
// Build the per-state flags table, visiting every row of the forward table.
228 fStateFlags
= new int32_t[statetable
->fNumStates
];
229 for (i
= 0; i
< statetable
->fNumStates
; ++i
)
231 const RBBIStateTableRow
*row
= (const RBBIStateTableRow
*)
232 (statetable
->fTableData
+ (statetable
->fRowLen
* i
));
// Only accepting rows (fAccepting == -1) with a non-zero tag index read
// from the rule status table.
234 if (row
->fAccepting
== -1 && row
->fTagIdx
!= 0)
236 const int32_t *vals
= (fData
->fRuleStatusTable
) + (row
->fTagIdx
);
// Initial limit covers only the first entry at fTagIdx.
// NOTE(review): presumably that first entry is a count by which valLimit is
// then extended -- confirm against the full source.
237 const int32_t *valLimit
= vals
+ 1;
239 while (vals
< valLimit
)
241 int32_t val
= *vals
++;
// Store the flags word computed for state i.
// NOTE(review): the statements that derive flags from val must be checked
// in the full source.
257 fStateFlags
[i
] = flags
;
// Constructs a tokenizer from rule source text.
// Forwards rules, parseErr and err to the RuleBasedBreakIterator base
// constructor, then continues only if err reports success.
// NOTE(review): presumably the guarded body calls init() -- confirm.
261 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&err
)
262 : RuleBasedBreakIterator(rules
, parseErr
, err
)
264 if (U_SUCCESS(err
)) {
// Constructs a tokenizer from a block of rule data.
// data is cast to RBBIDataHeader* and handed to the base-class constructor;
// the rest of the constructor runs only if status reports success.
// NOTE(review): presumably the base class adopts data in this overload
// (the other data overload passes kDontAdopt) and the guarded body calls
// init() -- confirm.
269 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
)
270 : RuleBasedBreakIterator((RBBIDataHeader
*)data
, status
)
272 if (U_SUCCESS(status
)) {
// Constructs a tokenizer from a block of const rule data.
// data is cast to const RBBIDataHeader* and passed to the base-class
// constructor together with RuleBasedBreakIterator::kDontAdopt; the unnamed
// EDontAdopt parameter only selects this overload.
// The rest of the constructor runs only if status reports success.
// NOTE(review): presumably the guarded body calls init() -- confirm.
277 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data
, enum EDontAdopt
, UErrorCode
&status
)
278 : RuleBasedBreakIterator((const RBBIDataHeader
*)data
, RuleBasedBreakIterator::kDontAdopt
, status
)
280 if (U_SUCCESS(status
)) {
// Releases the two arrays that init() allocates with new[].
// NOTE(review): if a constructor fails before init() runs, these pointers
// must already be null (or otherwise valid) for delete[] to be safe --
// confirm that the class header initialises them.
285 RuleBasedTokenizer::~RuleBasedTokenizer() {
286 delete [] fStateFlags
;
287 delete [] fLatin1Cat
;
292 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */