]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbi.cpp
ICU-62135.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4***************************************************************************
2ca993e8 5* Copyright (C) 1999-2016 International Business Machines Corporation
729e4ab9 6* and others. All rights reserved.
b75a7d8f
A
7***************************************************************************
8*/
374ca955 9//
0f5d89e8 10// file: rbbi.cpp Contains the implementation of the rule based break iterator
374ca955
A
11// runtime engine and the API implementation for
12// class RuleBasedBreakIterator
13//
b75a7d8f 14
51004dcb 15#include "utypeinfo.h" // for 'typeid' to work
729e4ab9 16
b75a7d8f
A
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/rbbi.h"
22#include "unicode/schriter.h"
73c04bcf 23#include "unicode/uchriter.h"
374ca955 24#include "unicode/uclean.h"
0f5d89e8
A
25#include "unicode/udata.h"
26
27#include "brkeng.h"
28#include "ucln_cmn.h"
b75a7d8f
A
29#include "cmemory.h"
30#include "cstring.h"
0f5d89e8
A
31#include "rbbidata.h"
32#include "rbbi_cache.h"
33#include "rbbirb.h"
b75a7d8f 34#include "uassert.h"
0f5d89e8
A
35#include "umutex.h"
36#include "uvectr32.h"
73c04bcf
A
37
38// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
39#if U_LOCAL_SERVICE_HOOK
40#include "localsvc.h"
41#endif
42
0f5d89e8
A
43// Apple specific
44//#include <os/log.h>
45
73c04bcf 46#ifdef RBBI_DEBUG
0f5d89e8 47static UBool gTrace = FALSE;
73c04bcf 48#endif
b75a7d8f
A
49
50U_NAMESPACE_BEGIN
51
46f4442e 52// The state number of the starting state
0f5d89e8 53constexpr int32_t START_STATE = 1;
b75a7d8f 54
46f4442e 55// The state-transition value indicating "stop"
0f5d89e8 56constexpr int32_t STOP_STATE = 0;
b75a7d8f 57
374ca955
A
58
59UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
b75a7d8f
A
60
61
62//=======================================================================
63// constructors
64//=======================================================================
65
66/**
67 * Constructs a RuleBasedBreakIterator that uses the already-created
68 * tables object that is passed in as a parameter.
69 */
70RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
0f5d89e8 71 : fSCharIter(UnicodeString())
b75a7d8f 72{
0f5d89e8 73 init(status);
374ca955 74 fData = new RBBIDataWrapper(data, status); // status checked in constructor
b75a7d8f 75 if (U_FAILURE(status)) {return;}
b75a7d8f
A
76 if(fData == 0) {
77 status = U_MEMORY_ALLOCATION_ERROR;
78 return;
79 }
80}
81
4388f060
A
82//
83// Construct from precompiled binary rules (tables). This constructor is public API,
84// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
85//
86RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
87 uint32_t ruleLength,
0f5d89e8
A
88 UErrorCode &status)
89 : fSCharIter(UnicodeString())
90{
91 init(status);
4388f060
A
92 if (U_FAILURE(status)) {
93 return;
94 }
95 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
96 status = U_ILLEGAL_ARGUMENT_ERROR;
97 return;
98 }
99 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
100 if (data->fLength > ruleLength) {
101 status = U_ILLEGAL_ARGUMENT_ERROR;
102 return;
103 }
0f5d89e8 104 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
4388f060
A
105 if (U_FAILURE(status)) {return;}
106 if(fData == 0) {
107 status = U_MEMORY_ALLOCATION_ERROR;
108 return;
109 }
0f5d89e8 110}
4388f060
A
111
112
b75a7d8f
A
113//-------------------------------------------------------------------------------
114//
115// Constructor from a UDataMemory handle to precompiled break rules
116// stored in an ICU data file.
117//
118//-------------------------------------------------------------------------------
119RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
0f5d89e8 120 : fSCharIter(UnicodeString())
b75a7d8f 121{
0f5d89e8 122 init(status);
374ca955 123 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
b75a7d8f 124 if (U_FAILURE(status)) {return;}
b75a7d8f
A
125 if(fData == 0) {
126 status = U_MEMORY_ALLOCATION_ERROR;
127 return;
128 }
129}
130
131
132
133//-------------------------------------------------------------------------------
134//
135// Constructor from a set of rules supplied as a string.
136//
137//-------------------------------------------------------------------------------
138RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
139 UParseError &parseError,
140 UErrorCode &status)
0f5d89e8 141 : fSCharIter(UnicodeString())
b75a7d8f 142{
0f5d89e8 143 init(status);
b75a7d8f
A
144 if (U_FAILURE(status)) {return;}
145 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
46f4442e 146 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
b75a7d8f
A
147 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
148 // creates and returns a complete RBBI. From here, in a constructor, we
149 // can't just return the object created by the builder factory, hence
150 // the assignment of the factory created object to "this".
151 if (U_SUCCESS(status)) {
152 *this = *bi;
153 delete bi;
154 }
155}
156
157
158//-------------------------------------------------------------------------------
159//
160// Default Constructor. Create an empty shell that can be set up later.
161// Used when creating a RuleBasedBreakIterator from a set
162// of rules.
163//-------------------------------------------------------------------------------
0f5d89e8
A
164RuleBasedBreakIterator::RuleBasedBreakIterator()
165 : fSCharIter(UnicodeString())
166{
167 UErrorCode status = U_ZERO_ERROR;
168 init(status);
b75a7d8f
A
169}
170
171
172//-------------------------------------------------------------------------------
173//
174// Copy constructor. Will produce a break iterator with the same behavior,
175// and which iterates over the same text, as the one passed in.
176//
177//-------------------------------------------------------------------------------
178RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
0f5d89e8
A
179: BreakIterator(other),
180 fSCharIter(UnicodeString())
b75a7d8f 181{
0f5d89e8
A
182 UErrorCode status = U_ZERO_ERROR;
183 this->init(status);
b75a7d8f
A
184 *this = other;
185}
186
187
188/**
189 * Destructor
190 */
191RuleBasedBreakIterator::~RuleBasedBreakIterator() {
0f5d89e8 192 if (fCharIter != &fSCharIter) {
73c04bcf
A
193 // fCharIter was adopted from the outside.
194 delete fCharIter;
195 }
196 fCharIter = NULL;
0f5d89e8
A
197
198 utext_close(&fText);
73c04bcf 199
b75a7d8f
A
200 if (fData != NULL) {
201 fData->removeReference();
202 fData = NULL;
203 }
0f5d89e8
A
204 delete fBreakCache;
205 fBreakCache = NULL;
206
207 delete fDictionaryCache;
208 fDictionaryCache = NULL;
209
210 delete fLanguageBreakEngines;
211 fLanguageBreakEngines = NULL;
212
213 delete fUnhandledBreakEngine;
214 fUnhandledBreakEngine = NULL;
215
216 delete [] fLatin1Cat;
217 fLatin1Cat = NULL;
b75a7d8f
A
218}
219
220/**
221 * Assignment operator. Sets this iterator to have the same behavior,
222 * and iterate over the same text, as the one passed in.
223 */
224RuleBasedBreakIterator&
225RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
226 if (this == &that) {
227 return *this;
228 }
0f5d89e8
A
229 BreakIterator::operator=(that);
230 fLineWordOpts = that.fLineWordOpts;
231
73c04bcf
A
232 if (fLanguageBreakEngines != NULL) {
233 delete fLanguageBreakEngines;
234 fLanguageBreakEngines = NULL; // Just rebuild for now
235 }
236 // TODO: clone fLanguageBreakEngines from "that"
237 UErrorCode status = U_ZERO_ERROR;
0f5d89e8 238 utext_clone(&fText, &that.fText, FALSE, TRUE, &status);
73c04bcf 239
0f5d89e8 240 if (fCharIter != &fSCharIter) {
73c04bcf
A
241 delete fCharIter;
242 }
0f5d89e8 243 fCharIter = &fSCharIter;
73c04bcf 244
0f5d89e8 245 if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
73c04bcf
A
246 // This is a little bit tricky - it will intially appear that
247 // this->fCharIter is adopted, even if that->fCharIter was
248 // not adopted. That's ok.
249 fCharIter = that.fCharIter->clone();
b75a7d8f 250 }
0f5d89e8
A
251 fSCharIter = that.fSCharIter;
252 if (fCharIter == NULL) {
253 fCharIter = &fSCharIter;
254 }
b75a7d8f
A
255
256 if (fData != NULL) {
257 fData->removeReference();
258 fData = NULL;
259 }
260 if (that.fData != NULL) {
261 fData = that.fData->addReference();
262 }
b75a7d8f 263
0f5d89e8
A
264 delete [] fLatin1Cat;
265 fLatin1Cat = NULL;
266
267 fPosition = that.fPosition;
268 fRuleStatusIndex = that.fRuleStatusIndex;
269 fDone = that.fDone;
270
271 // TODO: both the dictionary and the main cache need to be copied.
272 // Current position could be within a dictionary range. Trying to continue
273 // the iteration without the caches present would go to the rules, with
274 // the assumption that the current position is on a rule boundary.
275 fBreakCache->reset(fPosition, fRuleStatusIndex);
276 fDictionaryCache->reset();
277
b75a7d8f
A
278 return *this;
279}
280
281
282
283//-----------------------------------------------------------------------------
284//
285// init() Shared initialization routine. Used by all the constructors.
286// Initializes all fields, leaving the object in a consistent state.
287//
288//-----------------------------------------------------------------------------
0f5d89e8 289void RuleBasedBreakIterator::init(UErrorCode &status) {
73c04bcf 290 fCharIter = NULL;
374ca955 291 fData = NULL;
0f5d89e8
A
292 fLatin1Cat = NULL;
293 fPosition = 0;
294 fRuleStatusIndex = 0;
295 fDone = false;
374ca955 296 fDictionaryCharCount = 0;
0f5d89e8
A
297 fLanguageBreakEngines = NULL;
298 fUnhandledBreakEngine = NULL;
299 fBreakCache = NULL;
300 fDictionaryCache = NULL;
73c04bcf 301
0f5d89e8
A
302 // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
303 // fText = UTEXT_INITIALIZER;
304 static const UText initializedUText = UTEXT_INITIALIZER;
305 uprv_memcpy(&fText, &initializedUText, sizeof(UText));
306
307 if (U_FAILURE(status)) {
308 return;
309 }
310
311 utext_openUChars(&fText, NULL, 0, &status);
312 fDictionaryCache = new DictionaryCache(this, status);
313 fBreakCache = new BreakCache(this, status);
314 if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) {
315 status = U_MEMORY_ALLOCATION_ERROR;
316 }
b75a7d8f
A
317
318#ifdef RBBI_DEBUG
319 static UBool debugInitDone = FALSE;
320 if (debugInitDone == FALSE) {
321 char *debugEnv = getenv("U_RBBIDEBUG");
322 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
0f5d89e8 323 gTrace = TRUE;
b75a7d8f
A
324 }
325 debugInitDone = TRUE;
326 }
327#endif
328}
329
330
0f5d89e8
A
331void RuleBasedBreakIterator::initLatin1Cat(void) {
332 fLatin1Cat = new uint16_t[256];
333 for (UChar32 c = 0; c < 256; ++c) {
334 fLatin1Cat[c] = UTRIE2_GET16(fData->fTrie, c);
335 }
336}
b75a7d8f
A
337
338//-----------------------------------------------------------------------------
339//
340// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
341// behavior, and iterating over the same text, as this one.
342// Virtual function: does the right thing with subclasses.
343//
344//-----------------------------------------------------------------------------
345BreakIterator*
346RuleBasedBreakIterator::clone(void) const {
347 return new RuleBasedBreakIterator(*this);
348}
349
350/**
351 * Equality operator. Returns TRUE if both BreakIterators are of the
352 * same class, have the same behavior, and iterate over the same text.
353 */
354UBool
355RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
729e4ab9 356 if (typeid(*this) != typeid(that)) {
73c04bcf 357 return FALSE;
b75a7d8f 358 }
0f5d89e8
A
359 if (this == &that) {
360 return TRUE;
361 }
362
363 // The base class BreakIterator carries no state that participates in equality,
364 // and does not implement an equality function that would otherwise be
365 // checked at this point.
b75a7d8f
A
366
367 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
0f5d89e8 368 if (that2.fLineWordOpts != fLineWordOpts) {
2ca993e8
A
369 return FALSE;
370 }
73c04bcf 371
0f5d89e8 372 if (!utext_equals(&fText, &that2.fText)) {
73c04bcf 373 // The two break iterators are operating on different text,
0f5d89e8
A
374 // or have a different iteration position.
375 // Note that fText's position is always the same as the break iterator's position.
73c04bcf
A
376 return FALSE;
377 };
378
0f5d89e8
A
379 if (!(fPosition == that2.fPosition &&
380 fRuleStatusIndex == that2.fRuleStatusIndex &&
381 fDone == that2.fDone)) {
382 return FALSE;
383 }
73c04bcf
A
384
385 if (that2.fData == fData ||
386 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
387 // The two break iterators are using the same rules.
388 return TRUE;
b75a7d8f 389 }
73c04bcf 390 return FALSE;
b75a7d8f
A
391}
392
393/**
394 * Compute a hash code for this BreakIterator
395 * @return A hash code
396 */
397int32_t
398RuleBasedBreakIterator::hashCode(void) const {
399 int32_t hash = 0;
400 if (fData != NULL) {
401 hash = fData->hashCode();
402 }
403 return hash;
404}
405
73c04bcf
A
406
407void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
408 if (U_FAILURE(status)) {
409 return;
410 }
0f5d89e8
A
411 fBreakCache->reset();
412 fDictionaryCache->reset();
413 utext_clone(&fText, ut, FALSE, TRUE, &status);
73c04bcf
A
414
415 // Set up a dummy CharacterIterator to be returned if anyone
416 // calls getText(). With input from UText, there is no reasonable
417 // way to return a characterIterator over the actual input text.
418 // Return one over an empty string instead - this is the closest
419 // we can come to signaling a failure.
420 // (GetText() is obsolete, this failure is sort of OK)
0f5d89e8 421 fSCharIter.setText(UnicodeString());
73c04bcf 422
0f5d89e8 423 if (fCharIter != &fSCharIter) {
73c04bcf
A
424 // existing fCharIter was adopted from the outside. Delete it now.
425 delete fCharIter;
426 }
0f5d89e8 427 fCharIter = &fSCharIter;
73c04bcf
A
428
429 this->first();
430}
431
432
433UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
0f5d89e8 434 UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status);
73c04bcf
A
435 return result;
436}
437
438
b75a7d8f
A
439//=======================================================================
440// BreakIterator overrides
441//=======================================================================
442
443/**
0f5d89e8 444 * Return a CharacterIterator over the text being analyzed.
b75a7d8f 445 */
73c04bcf 446CharacterIterator&
b75a7d8f 447RuleBasedBreakIterator::getText() const {
73c04bcf 448 return *fCharIter;
b75a7d8f
A
449}
450
451/**
452 * Set the iterator to analyze a new piece of text. This function resets
453 * the current iteration position to the beginning of the text.
454 * @param newText An iterator over the text to analyze.
455 */
456void
457RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
0f5d89e8 458 // If we are holding a CharacterIterator adopted from a
73c04bcf 459 // previous call to this function, delete it now.
0f5d89e8 460 if (fCharIter != &fSCharIter) {
73c04bcf
A
461 delete fCharIter;
462 }
463
464 fCharIter = newText;
465 UErrorCode status = U_ZERO_ERROR;
0f5d89e8
A
466 fBreakCache->reset();
467 fDictionaryCache->reset();
468 if (newText==NULL || newText->startIndex() != 0) {
73c04bcf
A
469 // startIndex !=0 wants to be an error, but there's no way to report it.
470 // Make the iterator text be an empty string.
0f5d89e8 471 utext_openUChars(&fText, NULL, 0, &status);
73c04bcf 472 } else {
0f5d89e8 473 utext_openCharacterIterator(&fText, newText, &status);
73c04bcf 474 }
b75a7d8f
A
475 this->first();
476}
477
478/**
479 * Set the iterator to analyze a new piece of text. This function resets
480 * the current iteration position to the beginning of the text.
481 * @param newText An iterator over the text to analyze.
482 */
483void
484RuleBasedBreakIterator::setText(const UnicodeString& newText) {
73c04bcf 485 UErrorCode status = U_ZERO_ERROR;
0f5d89e8
A
486 fBreakCache->reset();
487 fDictionaryCache->reset();
488 utext_openConstUnicodeString(&fText, &newText, &status);
73c04bcf 489
0f5d89e8 490 // Set up a character iterator on the string.
73c04bcf
A
491 // Needed in case someone calls getText().
492 // Can not, unfortunately, do this lazily on the (probably never)
493 // call to getText(), because getText is const.
0f5d89e8 494 fSCharIter.setText(newText);
73c04bcf 495
0f5d89e8 496 if (fCharIter != &fSCharIter) {
73c04bcf
A
497 // old fCharIter was adopted from the outside. Delete it.
498 delete fCharIter;
b75a7d8f 499 }
0f5d89e8 500 fCharIter = &fSCharIter;
73c04bcf 501
b75a7d8f
A
502 this->first();
503}
504
505
4388f060
A
506/**
507 * Provide a new UText for the input text. Must reference text with contents identical
508 * to the original.
509 * Intended for use with text data originating in Java (garbage collected) environments
510 * where the data may be moved in memory at arbitrary times.
511 */
512RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
513 if (U_FAILURE(status)) {
514 return *this;
515 }
516 if (input == NULL) {
517 status = U_ILLEGAL_ARGUMENT_ERROR;
518 return *this;
519 }
0f5d89e8 520 int64_t pos = utext_getNativeIndex(&fText);
4388f060 521 // Shallow read-only clone of the new UText into the existing input UText
0f5d89e8 522 utext_clone(&fText, input, FALSE, TRUE, &status);
4388f060
A
523 if (U_FAILURE(status)) {
524 return *this;
525 }
0f5d89e8
A
526 utext_setNativeIndex(&fText, pos);
527 if (utext_getNativeIndex(&fText) != pos) {
4388f060
A
528 // Sanity check. The new input utext is supposed to have the exact same
529 // contents as the old. If we can't set to the same position, it doesn't.
530 // The contents underlying the old utext might be invalid at this point,
531 // so it's not safe to check directly.
532 status = U_ILLEGAL_ARGUMENT_ERROR;
533 }
534 return *this;
535}
536
b75a7d8f
A
537
538/**
b331163b
A
539 * Sets the current iteration position to the beginning of the text, position zero.
540 * @return The new iterator position, which is zero.
b75a7d8f
A
541 */
542int32_t RuleBasedBreakIterator::first(void) {
0f5d89e8
A
543 UErrorCode status = U_ZERO_ERROR;
544 if (!fBreakCache->seek(0)) {
545 fBreakCache->populateNear(0, status);
546 }
547 fBreakCache->current();
548 U_ASSERT(fPosition == 0);
73c04bcf 549 return 0;
b75a7d8f
A
550}
551
552/**
553 * Sets the current iteration position to the end of the text.
b75a7d8f
A
554 * @return The text's past-the-end offset.
555 */
556int32_t RuleBasedBreakIterator::last(void) {
0f5d89e8
A
557 int32_t endPos = (int32_t)utext_nativeLength(&fText);
558 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
559 (void)endShouldBeBoundary;
560 U_ASSERT(endShouldBeBoundary);
561 U_ASSERT(fPosition == endPos);
562 return endPos;
b75a7d8f
A
563}
564
565/**
566 * Advances the iterator either forward or backward the specified number of steps.
567 * Negative values move backward, and positive values move forward. This is
568 * equivalent to repeatedly calling next() or previous().
569 * @param n The number of steps to move. The sign indicates the direction
570 * (negative is backwards, and positive is forwards).
571 * @return The character offset of the boundary position n boundaries away from
572 * the current one.
573 */
574int32_t RuleBasedBreakIterator::next(int32_t n) {
0f5d89e8
A
575 int32_t result = 0;
576 if (n > 0) {
577 for (; n > 0 && result != UBRK_DONE; --n) {
578 result = next();
579 }
580 } else if (n < 0) {
581 for (; n < 0 && result != UBRK_DONE; ++n) {
582 result = previous();
583 }
584 } else {
585 result = current();
b75a7d8f
A
586 }
587 return result;
588}
589
590/**
591 * Advances the iterator to the next boundary position.
592 * @return The position of the first boundary after this one.
593 */
594int32_t RuleBasedBreakIterator::next(void) {
0f5d89e8
A
595 fBreakCache->next();
596 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
597}
598
599/**
0f5d89e8
A
600 * Move the iterator backwards, to the boundary preceding the current one.
601 *
602 * Starts from the current position within fText.
603 * Starting position need not be on a boundary.
604 *
605 * @return The position of the boundary position immediately preceding the starting position.
b75a7d8f
A
606 */
607int32_t RuleBasedBreakIterator::previous(void) {
0f5d89e8
A
608 UErrorCode status = U_ZERO_ERROR;
609 fBreakCache->previous(status);
610 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
611}
612
b75a7d8f
A
613/**
614 * Sets the iterator to refer to the first boundary position following
615 * the specified position.
0f5d89e8 616 * @param startPos The position from which to begin searching for a break position.
b75a7d8f
A
617 * @return The position of the first break after the current position.
618 */
0f5d89e8
A
619int32_t RuleBasedBreakIterator::following(int32_t startPos) {
620 // if the supplied position is before the beginning, return the
b331163b 621 // text's starting offset
0f5d89e8 622 if (startPos < 0) {
b331163b
A
623 return first();
624 }
625
626 // Move requested offset to a code point start. It might be on a trail surrogate,
0f5d89e8
A
627 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
628 utext_setNativeIndex(&fText, startPos);
629 startPos = (int32_t)utext_getNativeIndex(&fText);
b75a7d8f 630
0f5d89e8
A
631 UErrorCode status = U_ZERO_ERROR;
632 fBreakCache->following(startPos, status);
633 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
634}
635
636/**
637 * Sets the iterator to refer to the last boundary position before the
638 * specified position.
0f5d89e8 639 * @param offset The position to begin searching for a break from.
b75a7d8f
A
640 * @return The position of the last boundary before the starting position.
641 */
642int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
0f5d89e8 643 if (offset > utext_nativeLength(&fText)) {
b331163b
A
644 return last();
645 }
b331163b
A
646
647 // Move requested offset to a code point start. It might be on a trail surrogate,
648 // or on a trail byte if the input is UTF-8.
73c04bcf 649
0f5d89e8
A
650 utext_setNativeIndex(&fText, offset);
651 int32_t adjustedOffset = utext_getNativeIndex(&fText);
374ca955 652
0f5d89e8
A
653 UErrorCode status = U_ZERO_ERROR;
654 fBreakCache->preceding(adjustedOffset, status);
655 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
656}
657
658/**
659 * Returns true if the specfied position is a boundary position. As a side
660 * effect, leaves the iterator pointing to the first boundary position at
661 * or after "offset".
0f5d89e8 662 *
b75a7d8f
A
663 * @param offset the offset to check.
664 * @return True if "offset" is a boundary position.
665 */
666UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
b75a7d8f 667 // out-of-range indexes are never boundary positions
73c04bcf 668 if (offset < 0) {
b75a7d8f
A
669 first(); // For side effects on current position, tag values.
670 return FALSE;
671 }
672
0f5d89e8
A
673 // Adjust offset to be on a code point boundary and not beyond the end of the text.
674 // Note that isBoundary() is always false for offsets that are not on code point boundaries.
675 // But we still need the side effect of leaving iteration at the following boundary.
676
677 utext_setNativeIndex(&fText, offset);
678 int32_t adjustedOffset = utext_getNativeIndex(&fText);
679
680 bool result = false;
681 UErrorCode status = U_ZERO_ERROR;
682 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
683 result = (fBreakCache->current() == offset);
b75a7d8f
A
684 }
685
0f5d89e8
A
686 if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
687 // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
688 // but the iteration position remains set to the end of the text, which is a boundary.
689 return FALSE;
690 }
691 if (!result) {
692 // Not on a boundary. isBoundary() must leave iterator on the following boundary.
693 // Cache->seek(), above, left us on the preceding boundary, so advance one.
694 next();
695 }
73c04bcf 696 return result;
b75a7d8f
A
697}
698
0f5d89e8 699
b75a7d8f
A
700/**
701 * Returns the current iteration position.
702 * @return The current iteration position.
703 */
704int32_t RuleBasedBreakIterator::current(void) const {
0f5d89e8 705 return fPosition;
b75a7d8f 706}
0f5d89e8
A
707
708
b75a7d8f
A
709//=======================================================================
710// implementation
711//=======================================================================
712
73c04bcf
A
//
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
//               of user text.  A variable of this enum type tracks which of those
//               phases is current.  The state machine only fetches user input
//               while in the RUN mode.
//
enum RBBIRunMode {
    RBBI_START = 0,   // state machine processing is before first char of input
    RBBI_RUN   = 1,   // state machine processing is in the user text
    RBBI_END   = 2    // state machine processing is after end of user text.
};
723
b75a7d8f 724
2ca993e8
A
725// Map from look-ahead break states (corresponds to rules) to boundary positions.
726// Allows multiple lookahead break rules to be in flight at the same time.
727//
728// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
729// in the state table be sequential, then we can just index an array. And the
730// table could also tell us in advance how big that array needs to be.
731//
732// Before ICU 57 there was just a single simple variable for a look-ahead match that
733// was in progress. Two rules at once did not work.
734
735static const int32_t kMaxLookaheads = 8;
736struct LookAheadResults {
737 int32_t fUsedSlotLimit;
738 int32_t fPositions[8];
739 int16_t fKeys[8];
740
741 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
742
743 int32_t getPosition(int16_t key) {
744 for (int32_t i=0; i<fUsedSlotLimit; ++i) {
745 if (fKeys[i] == key) {
746 return fPositions[i];
747 }
748 }
0f5d89e8
A
749 // with NLLT source rules, Latn sample and ubrk_next, we see a request for key 79 here
750 // near the end of text, when setPosition has only ever set positions for key 80 or 82.
751 //U_ASSERT(FALSE);
2ca993e8
A
752 return -1;
753 }
754
755 void setPosition(int16_t key, int32_t position) {
756 int32_t i;
757 for (i=0; i<fUsedSlotLimit; ++i) {
758 if (fKeys[i] == key) {
759 fPositions[i] = position;
760 return;
761 }
762 }
763 if (i >= kMaxLookaheads) {
764 U_ASSERT(FALSE);
765 i = kMaxLookaheads - 1;
766 }
767 fKeys[i] = key;
768 fPositions[i] = position;
769 U_ASSERT(fUsedSlotLimit == i);
770 fUsedSlotLimit = i + 1;
771 }
772};
773
774
b75a7d8f
A
775//-----------------------------------------------------------------------------------
776//
0f5d89e8
A
777// handleNext()
778// Run the state machine to find a boundary
b75a7d8f
A
779//
780//-----------------------------------------------------------------------------------
0f5d89e8
A
781// Route handleNext calls through the following to handleNextInternal,
782// in order to handle fLineWordOpts.
783int32_t RuleBasedBreakIterator::handleNext() {
784 int32_t result = handleNextInternal();
785 while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
786 UChar32 prevChr = utext_char32At(&fText, result-1);
787 UChar32 currChr = utext_char32At(&fText, result);
788 if (currChr == U_SENTINEL || prevChr == U_SENTINEL) {
789 break;
790 }
791 if (fLineWordOpts == UBRK_LINEWORD_KEEP_HANGUL) {
792 UErrorCode status = U_ZERO_ERROR;
793 if (uscript_getScript(currChr, &status) != USCRIPT_HANGUL || uscript_getScript(prevChr, &status) != USCRIPT_HANGUL) {
794 break;
795 }
796 } else {
797 if (!u_isalpha(currChr) || !u_isalpha(prevChr)) {
798 break;
799 }
800 }
801 int32_t nextResult = handleNextInternal();
802 if (nextResult <= result) {
803 break;
804 }
805 result = nextResult;
806 }
807 return result;
808}
809
810int32_t RuleBasedBreakIterator::handleNextInternal() {
73c04bcf 811 int32_t state;
4388f060 812 uint16_t category = 0;
73c04bcf 813 RBBIRunMode mode;
0f5d89e8 814
73c04bcf
A
815 RBBIStateTableRow *row;
816 UChar32 c;
2ca993e8
A
817 LookAheadResults lookAheadMatches;
818 int32_t result = 0;
819 int32_t initialPosition = 0;
0f5d89e8 820 const RBBIStateTable *statetable = fData->fForwardTable;
2ca993e8
A
821 const char *tableData = statetable->fTableData;
822 uint32_t tableRowLen = statetable->fRowLen;
73c04bcf 823 #ifdef RBBI_DEBUG
0f5d89e8 824 if (gTrace) {
73c04bcf
A
825 RBBIDebugPuts("Handle Next pos char state category");
826 }
827 #endif
b75a7d8f 828
0f5d89e8
A
829 // handleNext alway sets the break tag value.
830 // Set the default for it.
831 fRuleStatusIndex = 0;
832
833 fDictionaryCharCount = 0;
b75a7d8f
A
834
835 // if we're already at the end of the text, return DONE.
0f5d89e8
A
836 initialPosition = fPosition;
837 UTEXT_SETNATIVEINDEX(&fText, initialPosition);
73c04bcf 838 result = initialPosition;
0f5d89e8
A
839 c = UTEXT_NEXT32(&fText);
840 if (c==U_SENTINEL) {
841 fDone = TRUE;
842 return UBRK_DONE;
b75a7d8f
A
843 }
844
73c04bcf
A
845 // Set the initial state for the state machine
846 state = START_STATE;
847 row = (RBBIStateTableRow *)
848 //(statetable->fTableData + (statetable->fRowLen * state));
849 (tableData + tableRowLen * state);
0f5d89e8
A
850
851
73c04bcf
A
852 mode = RBBI_RUN;
853 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
854 category = 2;
855 mode = RBBI_START;
856 }
b75a7d8f 857
b75a7d8f
A
858
859 // loop until we reach the end of the text or transition to state 0
73c04bcf 860 //
b75a7d8f 861 for (;;) {
73c04bcf 862 if (c == U_SENTINEL) {
374ca955 863 // Reached end of input string.
73c04bcf 864 if (mode == RBBI_END) {
0f5d89e8 865 // We have already run the loop one last time with the
73c04bcf
A
866 // character set to the psueudo {eof} value. Now it is time
867 // to unconditionally bail out.
73c04bcf 868 break;
374ca955 869 }
73c04bcf
A
870 // Run the loop one last time with the fake end-of-input character category.
871 mode = RBBI_END;
872 category = 1;
b75a7d8f 873 }
b75a7d8f 874
b75a7d8f 875 //
73c04bcf
A
876 // Get the char category. An incoming category of 1 or 2 means that
877 // we are preset for doing the beginning or end of input, and
878 // that we shouldn't get a category from an actual text input character.
879 //
880 if (mode == RBBI_RUN) {
881 // look up the current character's character category, which tells us
882 // which column in the state table to look at.
883 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
884 // not the size of the character going in, which is a UChar32.
885 //
0f5d89e8 886 category = (fLatin1Cat!=NULL && c<0x100)? fLatin1Cat[c]: UTRIE2_GET16(fData->fTrie, c);
73c04bcf
A
887
888 // Check the dictionary bit in the character's category.
0f5d89e8 889 // Counter is only used by dictionary based iteration.
73c04bcf
A
890 // Chars that need to be handled by a dictionary have a flag bit set
891 // in their category values.
892 //
893 if ((category & 0x4000) != 0) {
894 fDictionaryCharCount++;
895 // And off the dictionary flag bit.
896 category &= ~0x4000;
897 }
b75a7d8f
A
898 }
899
4388f060 900 #ifdef RBBI_DEBUG
0f5d89e8
A
901 if (gTrace) {
902 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(&fText));
374ca955
A
903 if (0x20<=c && c<0x7f) {
904 RBBIDebugPrintf("\"%c\" ", c);
905 } else {
906 RBBIDebugPrintf("%5x ", c);
907 }
908 RBBIDebugPrintf("%3d %3d\n", state, category);
b75a7d8f 909 }
374ca955 910 #endif
b75a7d8f 911
73c04bcf
A
912 // State Transition - move machine to its next state
913 //
4388f060 914
0f5d89e8 915 // fNextState is a variable-length array.
4388f060
A
916 U_ASSERT(category<fData->fHeader->fCatCount);
917 state = row->fNextState[category]; /*Not accessing beyond memory*/
b75a7d8f 918 row = (RBBIStateTableRow *)
73c04bcf
A
919 // (statetable->fTableData + (statetable->fRowLen * state));
920 (tableData + tableRowLen * state);
b75a7d8f 921
b75a7d8f 922
b75a7d8f 923 if (row->fAccepting == -1) {
73c04bcf
A
924 // Match found, common case.
925 if (mode != RBBI_START) {
0f5d89e8 926 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
73c04bcf 927 }
0f5d89e8 928 fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
b75a7d8f
A
929 }
930
2ca993e8
A
931 int16_t completedRule = row->fAccepting;
932 if (completedRule > 0) {
0f5d89e8 933 // Lookahead match is completed.
2ca993e8
A
934 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
935 if (lookaheadResult >= 0) {
0f5d89e8
A
936 fRuleStatusIndex = row->fTagIdx;
937 fPosition = lookaheadResult;
2ca993e8 938 return lookaheadResult;
b75a7d8f 939 }
b75a7d8f 940 }
2ca993e8
A
941 int16_t rule = row->fLookAhead;
942 if (rule != 0) {
943 // At the position of a '/' in a look-ahead match. Record it.
0f5d89e8 944 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
2ca993e8 945 lookAheadMatches.setPosition(rule, pos);
b75a7d8f
A
946 }
947
b75a7d8f 948 if (state == STOP_STATE) {
374ca955
A
949 // This is the normal exit from the lookup state machine.
950 // We have advanced through the string until it is certain that no
951 // longer match is possible, no matter what characters follow.
b75a7d8f
A
952 break;
953 }
0f5d89e8
A
954
955 // Advance to the next character.
73c04bcf
A
956 // If this is a beginning-of-input loop iteration, don't advance
957 // the input position. The next iteration will be processing the
958 // first real input character.
959 if (mode == RBBI_RUN) {
0f5d89e8 960 c = UTEXT_NEXT32(&fText);
73c04bcf
A
961 } else {
962 if (mode == RBBI_START) {
963 mode = RBBI_RUN;
964 }
965 }
b75a7d8f
A
966 }
967
374ca955 968 // The state machine is done. Check whether it found a match...
b75a7d8f 969
374ca955
A
970 // If the iterator failed to advance in the match engine, force it ahead by one.
971 // (This really indicates a defect in the break rules. They should always match
972 // at least one character.)
973 if (result == initialPosition) {
0f5d89e8
A
974 utext_setNativeIndex(&fText, initialPosition);
975 utext_next32(&fText);
976 result = (int32_t)utext_getNativeIndex(&fText);
977 fRuleStatusIndex = 0;
374ca955 978 }
b75a7d8f 979
374ca955 980 // Leave the iterator at our result position.
0f5d89e8 981 fPosition = result;
73c04bcf 982 #ifdef RBBI_DEBUG
0f5d89e8 983 if (gTrace) {
73c04bcf 984 RBBIDebugPrintf("result = %d\n\n", result);
b75a7d8f 985 }
73c04bcf 986 #endif
b75a7d8f
A
987 return result;
988}
989
990
374ca955
A
991//-----------------------------------------------------------------------------------
992//
0f5d89e8 993// handleSafePrevious()
374ca955 994//
0f5d89e8
A
995// Iterate backwards using the safe reverse rules.
996// The logic of this function is similar to handleNext(), but simpler
997// because the safe table does not require as many options.
374ca955
A
998//
999//-----------------------------------------------------------------------------------
0f5d89e8 1000int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
73c04bcf 1001 int32_t state;
4388f060 1002 uint16_t category = 0;
73c04bcf
A
1003 RBBIStateTableRow *row;
1004 UChar32 c;
73c04bcf 1005 int32_t result = 0;
73c04bcf 1006
0f5d89e8
A
1007 const RBBIStateTable *stateTable = fData->fReverseTable;
1008 UTEXT_SETNATIVEINDEX(&fText, fromPosition);
73c04bcf 1009 #ifdef RBBI_DEBUG
0f5d89e8 1010 if (gTrace) {
73c04bcf
A
1011 RBBIDebugPuts("Handle Previous pos char state category");
1012 }
1013 #endif
1014
73c04bcf 1015 // if we're already at the start of the text, return DONE.
0f5d89e8 1016 if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
73c04bcf
A
1017 return BreakIterator::DONE;
1018 }
374ca955 1019
73c04bcf 1020 // Set the initial state for the state machine
0f5d89e8 1021 c = UTEXT_PREVIOUS32(&fText);
73c04bcf 1022 state = START_STATE;
374ca955 1023 row = (RBBIStateTableRow *)
0f5d89e8 1024 (stateTable->fTableData + (stateTable->fRowLen * state));
374ca955 1025
73c04bcf
A
1026 // loop until we reach the start of the text or transition to state 0
1027 //
0f5d89e8 1028 for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
374ca955 1029
0f5d89e8
A
1030 // look up the current character's character category, which tells us
1031 // which column in the state table to look at.
1032 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1033 // not the size of the character going in, which is a UChar32.
374ca955 1034 //
0f5d89e8
A
1035 // And off the dictionary flag bit. For reverse iteration it is not used.
1036 category = UTRIE2_GET16(fData->fTrie, c);
1037 category &= ~0x4000;
374ca955
A
1038
1039 #ifdef RBBI_DEBUG
0f5d89e8
A
1040 if (gTrace) {
1041 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
374ca955
A
1042 if (0x20<=c && c<0x7f) {
1043 RBBIDebugPrintf("\"%c\" ", c);
1044 } else {
1045 RBBIDebugPrintf("%5x ", c);
1046 }
1047 RBBIDebugPrintf("%3d %3d\n", state, category);
1048 }
1049 #endif
1050
73c04bcf
A
1051 // State Transition - move machine to its next state
1052 //
0f5d89e8 1053 // fNextState is a variable-length array.
4388f060
A
1054 U_ASSERT(category<fData->fHeader->fCatCount);
1055 state = row->fNextState[category]; /*Not accessing beyond memory*/
374ca955 1056 row = (RBBIStateTableRow *)
0f5d89e8 1057 (stateTable->fTableData + (stateTable->fRowLen * state));
374ca955 1058
374ca955 1059 if (state == STOP_STATE) {
73c04bcf 1060 // This is the normal exit from the lookup state machine.
0f5d89e8 1061 // Transistion to state zero means we have found a safe point.
374ca955
A
1062 break;
1063 }
374ca955
A
1064 }
1065
73c04bcf 1066 // The state machine is done. Check whether it found a match...
0f5d89e8 1067 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
73c04bcf 1068 #ifdef RBBI_DEBUG
0f5d89e8 1069 if (gTrace) {
73c04bcf
A
1070 RBBIDebugPrintf("result = %d\n\n", result);
1071 }
1072 #endif
374ca955
A
1073 return result;
1074}
1075
b75a7d8f
A
1076//-------------------------------------------------------------------------------
1077//
1078// getRuleStatus() Return the break rule tag associated with the current
1079// iterator position. If the iterator arrived at its current
1080// position by iterating forwards, the value will have been
1081// cached by the handleNext() function.
1082//
b75a7d8f 1083//-------------------------------------------------------------------------------
b75a7d8f 1084
374ca955 1085int32_t RuleBasedBreakIterator::getRuleStatus() const {
374ca955
A
1086
1087 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1088 // (the number of status values.)
1089 // This function returns the last (largest) of the array of status values.
0f5d89e8 1090 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
374ca955
A
1091 int32_t tagVal = fData->fRuleStatusTable[idx];
1092
1093 return tagVal;
1094}
1095
1096
374ca955 1097int32_t RuleBasedBreakIterator::getRuleStatusVec(
0f5d89e8 1098 int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
374ca955
A
1099 if (U_FAILURE(status)) {
1100 return 0;
1101 }
1102
0f5d89e8 1103 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
374ca955
A
1104 int32_t numValsToCopy = numVals;
1105 if (numVals > capacity) {
1106 status = U_BUFFER_OVERFLOW_ERROR;
1107 numValsToCopy = capacity;
1108 }
1109 int i;
1110 for (i=0; i<numValsToCopy; i++) {
0f5d89e8 1111 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
374ca955
A
1112 }
1113 return numVals;
1114}
1115
0f5d89e8
A
1116// Apple custom addition
1117int32_t RuleBasedBreakIterator::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
1118{
1119 //os_log(OS_LOG_DEFAULT, "# tokenize 0: maxT %d; txt idx %lld, len %lld", maxTokens, utext_getNativeIndex(fText), utext_nativeLength(fText));
1120 if (fDone) {
1121 return 0;
1122 }
1123 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
1124 RuleBasedTokenRange *outTokenP = outTokenRanges;
1125 int32_t lastOffset = fPosition;
1126 //os_log(OS_LOG_DEFAULT, "# tokenize 1");
1127 while (outTokenP < outTokenLimit) {
1128 // start portion from inlining populateFollowing()
1129 int32_t pos = 0;
1130 int32_t ruleStatusIdx = 0;
1131 int32_t startPos = fPosition;
1132
1133 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1134 fPosition = pos;
1135 fRuleStatusIndex = ruleStatusIdx;
1136 } else {
1137 pos = handleNextInternal(); // sets fRuleStatusIndex for the pos it returns, updates fPosition
1138 if (pos == UBRK_DONE) {
1139 // fDone = TRUE; already set by handleNextInternal
1140 break;
1141 }
1142 // Use current result from handleNextInternal(), including fRuleStatusIndex,
1143 // unless overridden by dictionary subdivisions
1144 fPosition = pos;
1145 if (fDictionaryCharCount > 0) {
1146 // The text segment obtained from the rules includes dictionary characters.
1147 // Subdivide it, with subdivided results going into the dictionary cache.
1148 fDictionaryCache->populateDictionary(startPos, pos, fRuleStatusIndex, fRuleStatusIndex);
1149 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1150 fPosition = pos;
1151 fRuleStatusIndex = ruleStatusIdx;
1152 }
1153 }
1154 }
1155 // end portion from inlining populateFollowing()
1156 int32_t flagCount = fData->fRuleStatusTable[fRuleStatusIndex];
1157 const int32_t* flagPtr = fData->fRuleStatusTable + fRuleStatusIndex + flagCount;
1158 int32_t flagSet = *flagPtr; // if -1 then skip token
1159 if (flagSet != -1) {
1160 outTokenP->location = lastOffset;
1161 outTokenP++->length = fPosition - lastOffset;
1162 if (outTokenFlags) {
1163 // flagSet should be the OR of all flags returned by getRuleStatusVec;
1164 // here we collect from high-order to low-order.
1165 while (--flagCount > 0) {
1166 flagSet |= *--flagPtr;
1167 }
1168 *outTokenFlags++ = (unsigned long)flagSet;
1169 }
1170 }
1171 lastOffset = fPosition;
1172 }
1173 return (outTokenP - outTokenRanges);
1174}
374ca955 1175
b75a7d8f
A
1176//-------------------------------------------------------------------------------
1177//
1178// getBinaryRules Access to the compiled form of the rules,
1179// for use by build system tools that save the data
1180// for standard iterator types.
1181//
1182//-------------------------------------------------------------------------------
1183const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1184 const uint8_t *retPtr = NULL;
1185 length = 0;
1186
1187 if (fData != NULL) {
1188 retPtr = (const uint8_t *)fData->fHeader;
1189 length = fData->fHeader->fLength;
1190 }
1191 return retPtr;
1192}
1193
1194
57a6839d 1195BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
b75a7d8f
A
1196 int32_t &bufferSize,
1197 UErrorCode &status)
1198{
1199 if (U_FAILURE(status)){
1200 return NULL;
1201 }
1202
b75a7d8f 1203 if (bufferSize == 0) {
57a6839d 1204 bufferSize = 1; // preflighting for deprecated functionality
b75a7d8f
A
1205 return NULL;
1206 }
1207
57a6839d
A
1208 BreakIterator *clonedBI = clone();
1209 if (clonedBI == NULL) {
1210 status = U_MEMORY_ALLOCATION_ERROR;
1211 } else {
1212 status = U_SAFECLONE_ALLOCATED_WARNING;
b75a7d8f 1213 }
57a6839d 1214 return (RuleBasedBreakIterator *)clonedBI;
b75a7d8f
A
1215}
1216
73c04bcf
A
1217U_NAMESPACE_END
1218
73c04bcf 1219
0f5d89e8
A
1220static icu::UStack *gLanguageBreakFactories = nullptr;
1221static const icu::UnicodeString *gEmptyString = nullptr;
57a6839d 1222static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
0f5d89e8 1223static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
46f4442e 1224
73c04bcf 1225/**
0f5d89e8 1226 * Release all static memory held by breakiterator.
73c04bcf
A
1227 */
1228U_CDECL_BEGIN
0f5d89e8
A
1229static UBool U_CALLCONV rbbi_cleanup(void) {
1230 delete gLanguageBreakFactories;
1231 gLanguageBreakFactories = nullptr;
1232 delete gEmptyString;
1233 gEmptyString = nullptr;
57a6839d 1234 gLanguageBreakFactoriesInitOnce.reset();
0f5d89e8 1235 gRBBIInitOnce.reset();
73c04bcf 1236 return TRUE;
b75a7d8f 1237}
73c04bcf 1238U_CDECL_END
b75a7d8f 1239
73c04bcf
A
1240U_CDECL_BEGIN
1241static void U_CALLCONV _deleteFactory(void *obj) {
4388f060 1242 delete (icu::LanguageBreakFactory *) obj;
73c04bcf
A
1243}
1244U_CDECL_END
1245U_NAMESPACE_BEGIN
b75a7d8f 1246
0f5d89e8
A
1247static void U_CALLCONV rbbiInit() {
1248 gEmptyString = new UnicodeString();
1249 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1250}
1251
57a6839d
A
1252static void U_CALLCONV initLanguageFactories() {
1253 UErrorCode status = U_ZERO_ERROR;
1254 U_ASSERT(gLanguageBreakFactories == NULL);
1255 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1256 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1257 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1258 gLanguageBreakFactories->push(builtIn, status);
73c04bcf 1259#ifdef U_LOCAL_SERVICE_HOOK
57a6839d
A
1260 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1261 if (extra != NULL) {
1262 gLanguageBreakFactories->push(extra, status);
73c04bcf 1263 }
57a6839d 1264#endif
73c04bcf 1265 }
0f5d89e8 1266 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
57a6839d
A
1267}
1268
1269
1270static const LanguageBreakEngine*
0f5d89e8 1271getLanguageBreakEngineFromFactory(UChar32 c)
57a6839d
A
1272{
1273 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
73c04bcf
A
1274 if (gLanguageBreakFactories == NULL) {
1275 return NULL;
1276 }
0f5d89e8 1277
73c04bcf
A
1278 int32_t i = gLanguageBreakFactories->size();
1279 const LanguageBreakEngine *lbe = NULL;
1280 while (--i >= 0) {
1281 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
0f5d89e8 1282 lbe = factory->getEngineFor(c);
73c04bcf
A
1283 if (lbe != NULL) {
1284 break;
1285 }
1286 }
1287 return lbe;
1288}
1289
1290
1291//-------------------------------------------------------------------------------
1292//
1293// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
51004dcb 1294// character c.
73c04bcf
A
1295//
1296//-------------------------------------------------------------------------------
1297const LanguageBreakEngine *
1298RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1299 const LanguageBreakEngine *lbe = NULL;
1300 UErrorCode status = U_ZERO_ERROR;
0f5d89e8 1301
73c04bcf
A
1302 if (fLanguageBreakEngines == NULL) {
1303 fLanguageBreakEngines = new UStack(status);
46f4442e 1304 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
73c04bcf
A
1305 delete fLanguageBreakEngines;
1306 fLanguageBreakEngines = 0;
1307 return NULL;
1308 }
1309 }
0f5d89e8 1310
73c04bcf
A
1311 int32_t i = fLanguageBreakEngines->size();
1312 while (--i >= 0) {
1313 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
0f5d89e8 1314 if (lbe->handles(c)) {
73c04bcf
A
1315 return lbe;
1316 }
1317 }
0f5d89e8 1318
73c04bcf
A
1319 // No existing dictionary took the character. See if a factory wants to
1320 // give us a new LanguageBreakEngine for this character.
0f5d89e8
A
1321 lbe = getLanguageBreakEngineFromFactory(c);
1322
73c04bcf
A
1323 // If we got one, use it and push it on our stack.
1324 if (lbe != NULL) {
1325 fLanguageBreakEngines->push((void *)lbe, status);
1326 // Even if we can't remember it, we can keep looking it up, so
1327 // return it even if the push fails.
1328 return lbe;
1329 }
0f5d89e8 1330
73c04bcf
A
1331 // No engine is forthcoming for this character. Add it to the
1332 // reject set. Create the reject break engine if needed.
1333 if (fUnhandledBreakEngine == NULL) {
1334 fUnhandledBreakEngine = new UnhandledEngine(status);
1335 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1336 status = U_MEMORY_ALLOCATION_ERROR;
0f5d89e8 1337 return nullptr;
73c04bcf
A
1338 }
1339 // Put it last so that scripts for which we have an engine get tried
1340 // first.
1341 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1342 // If we can't insert it, or creation failed, get rid of it
1343 if (U_FAILURE(status)) {
1344 delete fUnhandledBreakEngine;
1345 fUnhandledBreakEngine = 0;
1346 return NULL;
1347 }
1348 }
0f5d89e8 1349
73c04bcf
A
1350 // Tell the reject engine about the character; at its discretion, it may
1351 // add more than just the one character.
0f5d89e8
A
1352 fUnhandledBreakEngine->handleCharacter(c);
1353
73c04bcf
A
1354 return fUnhandledBreakEngine;
1355}
1356
0f5d89e8
A
1357void RuleBasedBreakIterator::dumpCache() {
1358 fBreakCache->dumpCache();
1359}
73c04bcf 1360
0f5d89e8
A
1361void RuleBasedBreakIterator::dumpTables() {
1362 fData->printData();
1363}
73c04bcf 1364
0f5d89e8
A
1365/**
1366 * Returns the description used to create this iterator
1367 */
73c04bcf 1368
0f5d89e8
A
1369const UnicodeString&
1370RuleBasedBreakIterator::getRules() const {
1371 if (fData != NULL) {
1372 return fData->getRuleSourceString();
1373 } else {
1374 umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1375 return *gEmptyString;
1376 }
73c04bcf 1377}
b75a7d8f
A
1378
1379U_NAMESPACE_END
1380
1381#endif /* #if !UCONFIG_NO_BREAK_ITERATION */