2 ***************************************************************************
3 * Copyright (C) 2006-2008,2017-2018 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION
11 #include "unicode/ustring.h"
12 #include "unicode/utext.h"
13 #include "rbbidata57.h"
19 // The following is now static in rbbi.cpp, gets set dynamically.
20 // For now duplicate here to build, and force to TRUE if desired.
21 static UBool fTrace
= FALSE
;
27 static const int16_t START_STATE
= 1; // The state number of the starting state
28 static const int16_t STOP_STATE
= 0; // The state-transition value indicating "stop"
30 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
)
32 RuleBasedTokenRange
*outTokenLimit
= outTokenRanges
+ maxTokens
;
33 RuleBasedTokenRange
*outTokenP
= outTokenRanges
;
35 uint16_t category
= 0;
37 const RBBIStateTableRow
*row
;
38 const RBBIStateTableRow
*const startRow
= fStartRow
;
40 int32_t lastAcceptingState
= 0;
44 const char *const tableData
= fData
->fForwardTable
->fTableData
;
45 const uint32_t tableRowLen
= fData
->fForwardTable
->fRowLen
;
50 RBBIDebugPuts("Handle Next pos char state category");
54 fLastStatusIndexValid
= FALSE
;
56 // if we're already at the end of the text, return DONE.
57 prev
= (signed long)UTEXT_GETNATIVEINDEX(text
);
59 // loop until we reach the end of the text or transition to state 0
61 const UTrie
*trie
= &fData
->fTrie
;
62 while (outTokenP
< outTokenLimit
) {
63 result
= prev
; // fallback initialization
64 c
= UTEXT_NEXT32(text
);
69 // Set the initial state for the state machine
73 // if we have cached break positions and we're still in the range
74 // covered by them, just move one step forward in the cache
75 if (fCachedBreakPositions
!= NULL
) {
76 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
78 result
= fCachedBreakPositions
[fPositionInCache
];
86 while (c
!= U_SENTINEL
) {
88 // Get the char category. An incoming category of 1 or 2 means that
89 // we are preset for doing the beginning or end of input, and
90 // that we shouldn't get a category from an actual text input character.
92 // look up the current character's character category, which tells us
93 // which column in the state table to look at.
94 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
95 // not the size of the character going in, which is a UChar32.
98 category
= fLatin1Cat
[c
];
100 UTRIE_GET16(trie
, c
, category
);
102 // Check the dictionary bit in the character's category.
103 // Counter is only used by dictionary based iterators (subclasses).
104 // Chars that need to be handled by a dictionary have a flag bit set
105 // in their category values.
107 if ((category
& 0x4000) != 0) {
108 fDictionaryCharCount
++;
109 // And off the dictionary flag bit.
115 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText
));
116 if (0x20<=c
&& c
<0x7f) {
117 RBBIDebugPrintf("\"%c\" ", c
);
119 RBBIDebugPrintf("%5x ", c
);
121 RBBIDebugPrintf("%3d %3d\n", state
, category
);
125 // State Transition - move machine to its next state
128 // Note: fNextState is defined as uint16_t[2], but we are casting
129 // a generated RBBI table to RBBIStateTableRow and some tables
130 // actually have more than 2 categories.
131 U_ASSERT(category
<fData
->fHeader
->fCatCount
);
132 state
= row
->fNextState
[category
];
133 row
= (const RBBIStateTableRow
*) (tableData
+ tableRowLen
* state
);
135 if (row
->fAccepting
== -1) {
136 // Match found, common case.
137 result
= (signed long)UTEXT_GETNATIVEINDEX(text
);
138 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
139 //lastStatusRow = row;
140 lastAcceptingState
= state
;
143 if (state
== STOP_STATE
) {
144 // This is the normal exit from the lookup state machine.
145 // We have advanced through the string until it is certain that no
146 // longer match is possible, no matter what characters follow.
150 // Advance to the next character.
151 // If this is a beginning-of-input loop iteration, don't advance
152 // the input position. The next iteration will be processing the
153 // first real input character.
154 c
= UTEXT_NEXT32(text
);
157 if (fDictionaryCharCount
> 0) {
158 result
= (signed long) checkDictionary(prev
, (int32_t) result
, FALSE
);
162 // The state machine is done. Check whether it found a match...
164 // If the iterator failed to advance in the match engine, force it ahead by one.
165 // (This really indicates a defect in the break rules. They should always match
166 // at least one character.). Added in open-source ICU r13469
167 UBool setFlagsZero
= FALSE
;
168 if (result
== prev
) {
169 UTEXT_SETNATIVEINDEX(text
, prev
);
171 result
= (int32_t)UTEXT_GETNATIVEINDEX(text
);
175 // Leave the iterator at our result position.
176 UTEXT_SETNATIVEINDEX(text
, result
);
178 RuleBasedTokenRange range
= {(signed long)prev
, (signed long) (result
-prev
)};
179 int32_t flags
= (!setFlagsZero
)? fStateFlags
[lastAcceptingState
]: 0;
187 RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range
.location
, range
.length
, flags
);
190 *outTokenP
++ = range
;
193 *outTokenFlags
++ = (unsigned long) flags
;
196 if (flags
& 0x40000000) {
205 return (outTokenP
- outTokenRanges
);
209 RuleBasedTokenizer::init()
211 const RBBIStateTable
*statetable
= fData
->fForwardTable
;
212 setBreakType(UBRK_WORD
);
213 fStartRow
= (const RBBIStateTableRow
*)
214 (statetable
->fTableData
+ (statetable
->fRowLen
* START_STATE
));
216 const UTrie
*trie
= &fData
->fTrie
;
218 fLatin1Cat
= new int16_t[256];
219 for (i
= 0; i
< 256; ++i
)
221 //UTRIE_GET16(trie, i, category);
222 //fLatin1Cat[i] = category;
223 fLatin1Cat
[i
] = _UTRIE_GET_RAW(trie
, index
, 0, i
);
225 fStateFlags
= new int32_t[statetable
->fNumStates
];
226 for (i
= 0; i
< statetable
->fNumStates
; ++i
)
228 const RBBIStateTableRow
*row
= (const RBBIStateTableRow
*)
229 (statetable
->fTableData
+ (statetable
->fRowLen
* i
));
231 if (row
->fAccepting
== -1 && row
->fTagIdx
!= 0)
233 const int32_t *vals
= (fData
->fRuleStatusTable
) + (row
->fTagIdx
);
234 const int32_t *valLimit
= vals
+ 1;
236 while (vals
< valLimit
)
238 int32_t val
= *vals
++;
254 fStateFlags
[i
] = flags
;
258 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&err
)
259 : RuleBasedBreakIterator57(rules
, parseErr
, err
)
261 if (U_SUCCESS(err
)) {
266 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
)
267 : RuleBasedBreakIterator57((RBBIDataHeader57
*)data
, status
)
269 if (U_SUCCESS(status
)) {
274 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data
, enum EDontAdopt
, UErrorCode
&status
)
275 : RuleBasedBreakIterator57((const RBBIDataHeader57
*)data
, RuleBasedBreakIterator57::kDontAdopt
, status
)
277 if (U_SUCCESS(status
)) {
282 RuleBasedTokenizer::~RuleBasedTokenizer() {
283 delete [] fStateFlags
;
284 delete [] fLatin1Cat
;
289 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */