]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbi.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4***************************************************************************
2ca993e8 5* Copyright (C) 1999-2016 International Business Machines Corporation
729e4ab9 6* and others. All rights reserved.
b75a7d8f
A
7***************************************************************************
8*/
374ca955 9//
0f5d89e8 10// file: rbbi.cpp Contains the implementation of the rule based break iterator
374ca955
A
11// runtime engine and the API implementation for
12// class RuleBasedBreakIterator
13//
b75a7d8f 14
51004dcb 15#include "utypeinfo.h" // for 'typeid' to work
729e4ab9 16
b75a7d8f
A
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
3d1f044b
A
21#include <cinttypes>
22
b75a7d8f
A
23#include "unicode/rbbi.h"
24#include "unicode/schriter.h"
73c04bcf 25#include "unicode/uchriter.h"
374ca955 26#include "unicode/uclean.h"
0f5d89e8
A
27#include "unicode/udata.h"
28
29#include "brkeng.h"
30#include "ucln_cmn.h"
b75a7d8f
A
31#include "cmemory.h"
32#include "cstring.h"
3d1f044b 33#include "localsvc.h"
0f5d89e8
A
34#include "rbbidata.h"
35#include "rbbi_cache.h"
36#include "rbbirb.h"
b75a7d8f 37#include "uassert.h"
0f5d89e8
A
38#include "umutex.h"
39#include "uvectr32.h"
73c04bcf 40
73c04bcf 41#ifdef RBBI_DEBUG
0f5d89e8 42static UBool gTrace = FALSE;
73c04bcf 43#endif
b75a7d8f
A
44
45U_NAMESPACE_BEGIN
46
46f4442e 47// The state number of the starting state
0f5d89e8 48constexpr int32_t START_STATE = 1;
b75a7d8f 49
46f4442e 50// The state-transition value indicating "stop"
0f5d89e8 51constexpr int32_t STOP_STATE = 0;
b75a7d8f 52
374ca955
A
53
54UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
b75a7d8f
A
55
56
57//=======================================================================
58// constructors
59//=======================================================================
60
61/**
62 * Constructs a RuleBasedBreakIterator that uses the already-created
63 * tables object that is passed in as a parameter.
64 */
65RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
0f5d89e8 66 : fSCharIter(UnicodeString())
b75a7d8f 67{
0f5d89e8 68 init(status);
374ca955 69 fData = new RBBIDataWrapper(data, status); // status checked in constructor
b75a7d8f 70 if (U_FAILURE(status)) {return;}
b75a7d8f
A
71 if(fData == 0) {
72 status = U_MEMORY_ALLOCATION_ERROR;
73 return;
74 }
75}
76
4388f060
A
77//
78// Construct from precompiled binary rules (tables). This constructor is public API,
79// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
80//
81RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
82 uint32_t ruleLength,
0f5d89e8
A
83 UErrorCode &status)
84 : fSCharIter(UnicodeString())
85{
86 init(status);
4388f060
A
87 if (U_FAILURE(status)) {
88 return;
89 }
90 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
91 status = U_ILLEGAL_ARGUMENT_ERROR;
92 return;
93 }
94 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
95 if (data->fLength > ruleLength) {
96 status = U_ILLEGAL_ARGUMENT_ERROR;
97 return;
98 }
0f5d89e8 99 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
4388f060
A
100 if (U_FAILURE(status)) {return;}
101 if(fData == 0) {
102 status = U_MEMORY_ALLOCATION_ERROR;
103 return;
104 }
0f5d89e8 105}
4388f060
A
106
107
b75a7d8f
A
108//-------------------------------------------------------------------------------
109//
110// Constructor from a UDataMemory handle to precompiled break rules
111// stored in an ICU data file.
112//
113//-------------------------------------------------------------------------------
114RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
0f5d89e8 115 : fSCharIter(UnicodeString())
b75a7d8f 116{
0f5d89e8 117 init(status);
374ca955 118 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
b75a7d8f 119 if (U_FAILURE(status)) {return;}
b75a7d8f
A
120 if(fData == 0) {
121 status = U_MEMORY_ALLOCATION_ERROR;
122 return;
123 }
124}
125
126
127
128//-------------------------------------------------------------------------------
129//
130// Constructor from a set of rules supplied as a string.
131//
132//-------------------------------------------------------------------------------
133RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
134 UParseError &parseError,
135 UErrorCode &status)
0f5d89e8 136 : fSCharIter(UnicodeString())
b75a7d8f 137{
0f5d89e8 138 init(status);
b75a7d8f
A
139 if (U_FAILURE(status)) {return;}
140 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
46f4442e 141 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
b75a7d8f
A
142 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
143 // creates and returns a complete RBBI. From here, in a constructor, we
144 // can't just return the object created by the builder factory, hence
145 // the assignment of the factory created object to "this".
146 if (U_SUCCESS(status)) {
147 *this = *bi;
148 delete bi;
149 }
150}
151
152
153//-------------------------------------------------------------------------------
154//
155// Default Constructor. Create an empty shell that can be set up later.
156// Used when creating a RuleBasedBreakIterator from a set
157// of rules.
158//-------------------------------------------------------------------------------
0f5d89e8
A
159RuleBasedBreakIterator::RuleBasedBreakIterator()
160 : fSCharIter(UnicodeString())
161{
162 UErrorCode status = U_ZERO_ERROR;
163 init(status);
b75a7d8f
A
164}
165
166
167//-------------------------------------------------------------------------------
168//
169// Copy constructor. Will produce a break iterator with the same behavior,
170// and which iterates over the same text, as the one passed in.
171//
172//-------------------------------------------------------------------------------
173RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
0f5d89e8
A
174: BreakIterator(other),
175 fSCharIter(UnicodeString())
b75a7d8f 176{
0f5d89e8
A
177 UErrorCode status = U_ZERO_ERROR;
178 this->init(status);
b75a7d8f
A
179 *this = other;
180}
181
182
183/**
184 * Destructor
185 */
186RuleBasedBreakIterator::~RuleBasedBreakIterator() {
0f5d89e8 187 if (fCharIter != &fSCharIter) {
73c04bcf
A
188 // fCharIter was adopted from the outside.
189 delete fCharIter;
190 }
191 fCharIter = NULL;
0f5d89e8
A
192
193 utext_close(&fText);
73c04bcf 194
b75a7d8f
A
195 if (fData != NULL) {
196 fData->removeReference();
197 fData = NULL;
198 }
0f5d89e8
A
199 delete fBreakCache;
200 fBreakCache = NULL;
201
202 delete fDictionaryCache;
203 fDictionaryCache = NULL;
204
205 delete fLanguageBreakEngines;
206 fLanguageBreakEngines = NULL;
207
208 delete fUnhandledBreakEngine;
209 fUnhandledBreakEngine = NULL;
210
211 delete [] fLatin1Cat;
212 fLatin1Cat = NULL;
b75a7d8f
A
213}
214
215/**
216 * Assignment operator. Sets this iterator to have the same behavior,
217 * and iterate over the same text, as the one passed in.
218 */
219RuleBasedBreakIterator&
220RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
221 if (this == &that) {
222 return *this;
223 }
0f5d89e8
A
224 BreakIterator::operator=(that);
225 fLineWordOpts = that.fLineWordOpts;
226
73c04bcf
A
227 if (fLanguageBreakEngines != NULL) {
228 delete fLanguageBreakEngines;
229 fLanguageBreakEngines = NULL; // Just rebuild for now
230 }
231 // TODO: clone fLanguageBreakEngines from "that"
232 UErrorCode status = U_ZERO_ERROR;
0f5d89e8 233 utext_clone(&fText, &that.fText, FALSE, TRUE, &status);
73c04bcf 234
0f5d89e8 235 if (fCharIter != &fSCharIter) {
73c04bcf
A
236 delete fCharIter;
237 }
0f5d89e8 238 fCharIter = &fSCharIter;
73c04bcf 239
0f5d89e8 240 if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
73c04bcf
A
241 // This is a little bit tricky - it will intially appear that
242 // this->fCharIter is adopted, even if that->fCharIter was
243 // not adopted. That's ok.
244 fCharIter = that.fCharIter->clone();
b75a7d8f 245 }
0f5d89e8
A
246 fSCharIter = that.fSCharIter;
247 if (fCharIter == NULL) {
248 fCharIter = &fSCharIter;
249 }
b75a7d8f
A
250
251 if (fData != NULL) {
252 fData->removeReference();
253 fData = NULL;
254 }
255 if (that.fData != NULL) {
256 fData = that.fData->addReference();
257 }
b75a7d8f 258
0f5d89e8
A
259 delete [] fLatin1Cat;
260 fLatin1Cat = NULL;
261
262 fPosition = that.fPosition;
263 fRuleStatusIndex = that.fRuleStatusIndex;
264 fDone = that.fDone;
265
266 // TODO: both the dictionary and the main cache need to be copied.
267 // Current position could be within a dictionary range. Trying to continue
268 // the iteration without the caches present would go to the rules, with
269 // the assumption that the current position is on a rule boundary.
270 fBreakCache->reset(fPosition, fRuleStatusIndex);
271 fDictionaryCache->reset();
272
b75a7d8f
A
273 return *this;
274}
275
276
277
278//-----------------------------------------------------------------------------
279//
280// init() Shared initialization routine. Used by all the constructors.
281// Initializes all fields, leaving the object in a consistent state.
282//
283//-----------------------------------------------------------------------------
0f5d89e8 284void RuleBasedBreakIterator::init(UErrorCode &status) {
73c04bcf 285 fCharIter = NULL;
374ca955 286 fData = NULL;
0f5d89e8
A
287 fLatin1Cat = NULL;
288 fPosition = 0;
289 fRuleStatusIndex = 0;
290 fDone = false;
374ca955 291 fDictionaryCharCount = 0;
0f5d89e8
A
292 fLanguageBreakEngines = NULL;
293 fUnhandledBreakEngine = NULL;
294 fBreakCache = NULL;
295 fDictionaryCache = NULL;
73c04bcf 296
0f5d89e8
A
297 // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
298 // fText = UTEXT_INITIALIZER;
299 static const UText initializedUText = UTEXT_INITIALIZER;
300 uprv_memcpy(&fText, &initializedUText, sizeof(UText));
301
302 if (U_FAILURE(status)) {
303 return;
304 }
305
306 utext_openUChars(&fText, NULL, 0, &status);
307 fDictionaryCache = new DictionaryCache(this, status);
308 fBreakCache = new BreakCache(this, status);
309 if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) {
310 status = U_MEMORY_ALLOCATION_ERROR;
311 }
b75a7d8f
A
312
313#ifdef RBBI_DEBUG
314 static UBool debugInitDone = FALSE;
315 if (debugInitDone == FALSE) {
316 char *debugEnv = getenv("U_RBBIDEBUG");
317 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
0f5d89e8 318 gTrace = TRUE;
b75a7d8f
A
319 }
320 debugInitDone = TRUE;
321 }
322#endif
323}
324
325
0f5d89e8
A
326void RuleBasedBreakIterator::initLatin1Cat(void) {
327 fLatin1Cat = new uint16_t[256];
328 for (UChar32 c = 0; c < 256; ++c) {
329 fLatin1Cat[c] = UTRIE2_GET16(fData->fTrie, c);
330 }
331}
b75a7d8f
A
332
333//-----------------------------------------------------------------------------
334//
335// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
336// behavior, and iterating over the same text, as this one.
337// Virtual function: does the right thing with subclasses.
338//
339//-----------------------------------------------------------------------------
340BreakIterator*
341RuleBasedBreakIterator::clone(void) const {
342 return new RuleBasedBreakIterator(*this);
343}
344
345/**
346 * Equality operator. Returns TRUE if both BreakIterators are of the
347 * same class, have the same behavior, and iterate over the same text.
348 */
349UBool
350RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
729e4ab9 351 if (typeid(*this) != typeid(that)) {
73c04bcf 352 return FALSE;
b75a7d8f 353 }
0f5d89e8
A
354 if (this == &that) {
355 return TRUE;
356 }
357
358 // The base class BreakIterator carries no state that participates in equality,
359 // and does not implement an equality function that would otherwise be
360 // checked at this point.
b75a7d8f
A
361
362 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
0f5d89e8 363 if (that2.fLineWordOpts != fLineWordOpts) {
2ca993e8
A
364 return FALSE;
365 }
73c04bcf 366
0f5d89e8 367 if (!utext_equals(&fText, &that2.fText)) {
73c04bcf 368 // The two break iterators are operating on different text,
0f5d89e8
A
369 // or have a different iteration position.
370 // Note that fText's position is always the same as the break iterator's position.
73c04bcf
A
371 return FALSE;
372 };
373
0f5d89e8
A
374 if (!(fPosition == that2.fPosition &&
375 fRuleStatusIndex == that2.fRuleStatusIndex &&
376 fDone == that2.fDone)) {
377 return FALSE;
378 }
73c04bcf
A
379
380 if (that2.fData == fData ||
381 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
382 // The two break iterators are using the same rules.
383 return TRUE;
b75a7d8f 384 }
73c04bcf 385 return FALSE;
b75a7d8f
A
386}
387
388/**
389 * Compute a hash code for this BreakIterator
390 * @return A hash code
391 */
392int32_t
393RuleBasedBreakIterator::hashCode(void) const {
394 int32_t hash = 0;
395 if (fData != NULL) {
396 hash = fData->hashCode();
397 }
398 return hash;
399}
400
73c04bcf
A
401
402void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
403 if (U_FAILURE(status)) {
404 return;
405 }
0f5d89e8
A
406 fBreakCache->reset();
407 fDictionaryCache->reset();
408 utext_clone(&fText, ut, FALSE, TRUE, &status);
73c04bcf
A
409
410 // Set up a dummy CharacterIterator to be returned if anyone
411 // calls getText(). With input from UText, there is no reasonable
412 // way to return a characterIterator over the actual input text.
413 // Return one over an empty string instead - this is the closest
414 // we can come to signaling a failure.
415 // (GetText() is obsolete, this failure is sort of OK)
0f5d89e8 416 fSCharIter.setText(UnicodeString());
73c04bcf 417
0f5d89e8 418 if (fCharIter != &fSCharIter) {
73c04bcf
A
419 // existing fCharIter was adopted from the outside. Delete it now.
420 delete fCharIter;
421 }
0f5d89e8 422 fCharIter = &fSCharIter;
73c04bcf
A
423
424 this->first();
425}
426
427
428UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
0f5d89e8 429 UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status);
73c04bcf
A
430 return result;
431}
432
433
b75a7d8f
A
434//=======================================================================
435// BreakIterator overrides
436//=======================================================================
437
438/**
0f5d89e8 439 * Return a CharacterIterator over the text being analyzed.
b75a7d8f 440 */
73c04bcf 441CharacterIterator&
b75a7d8f 442RuleBasedBreakIterator::getText() const {
73c04bcf 443 return *fCharIter;
b75a7d8f
A
444}
445
446/**
447 * Set the iterator to analyze a new piece of text. This function resets
448 * the current iteration position to the beginning of the text.
449 * @param newText An iterator over the text to analyze.
450 */
451void
452RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
0f5d89e8 453 // If we are holding a CharacterIterator adopted from a
73c04bcf 454 // previous call to this function, delete it now.
0f5d89e8 455 if (fCharIter != &fSCharIter) {
73c04bcf
A
456 delete fCharIter;
457 }
458
459 fCharIter = newText;
460 UErrorCode status = U_ZERO_ERROR;
0f5d89e8
A
461 fBreakCache->reset();
462 fDictionaryCache->reset();
463 if (newText==NULL || newText->startIndex() != 0) {
73c04bcf
A
464 // startIndex !=0 wants to be an error, but there's no way to report it.
465 // Make the iterator text be an empty string.
0f5d89e8 466 utext_openUChars(&fText, NULL, 0, &status);
73c04bcf 467 } else {
0f5d89e8 468 utext_openCharacterIterator(&fText, newText, &status);
73c04bcf 469 }
b75a7d8f
A
470 this->first();
471}
472
473/**
474 * Set the iterator to analyze a new piece of text. This function resets
475 * the current iteration position to the beginning of the text.
476 * @param newText An iterator over the text to analyze.
477 */
478void
479RuleBasedBreakIterator::setText(const UnicodeString& newText) {
73c04bcf 480 UErrorCode status = U_ZERO_ERROR;
0f5d89e8
A
481 fBreakCache->reset();
482 fDictionaryCache->reset();
483 utext_openConstUnicodeString(&fText, &newText, &status);
73c04bcf 484
0f5d89e8 485 // Set up a character iterator on the string.
73c04bcf
A
486 // Needed in case someone calls getText().
487 // Can not, unfortunately, do this lazily on the (probably never)
488 // call to getText(), because getText is const.
0f5d89e8 489 fSCharIter.setText(newText);
73c04bcf 490
0f5d89e8 491 if (fCharIter != &fSCharIter) {
73c04bcf
A
492 // old fCharIter was adopted from the outside. Delete it.
493 delete fCharIter;
b75a7d8f 494 }
0f5d89e8 495 fCharIter = &fSCharIter;
73c04bcf 496
b75a7d8f
A
497 this->first();
498}
499
500
4388f060
A
501/**
502 * Provide a new UText for the input text. Must reference text with contents identical
503 * to the original.
504 * Intended for use with text data originating in Java (garbage collected) environments
505 * where the data may be moved in memory at arbitrary times.
506 */
507RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
508 if (U_FAILURE(status)) {
509 return *this;
510 }
511 if (input == NULL) {
512 status = U_ILLEGAL_ARGUMENT_ERROR;
513 return *this;
514 }
0f5d89e8 515 int64_t pos = utext_getNativeIndex(&fText);
4388f060 516 // Shallow read-only clone of the new UText into the existing input UText
0f5d89e8 517 utext_clone(&fText, input, FALSE, TRUE, &status);
4388f060
A
518 if (U_FAILURE(status)) {
519 return *this;
520 }
0f5d89e8
A
521 utext_setNativeIndex(&fText, pos);
522 if (utext_getNativeIndex(&fText) != pos) {
4388f060
A
523 // Sanity check. The new input utext is supposed to have the exact same
524 // contents as the old. If we can't set to the same position, it doesn't.
525 // The contents underlying the old utext might be invalid at this point,
526 // so it's not safe to check directly.
527 status = U_ILLEGAL_ARGUMENT_ERROR;
528 }
529 return *this;
530}
531
b75a7d8f
A
532
533/**
b331163b
A
534 * Sets the current iteration position to the beginning of the text, position zero.
535 * @return The new iterator position, which is zero.
b75a7d8f
A
536 */
537int32_t RuleBasedBreakIterator::first(void) {
0f5d89e8
A
538 UErrorCode status = U_ZERO_ERROR;
539 if (!fBreakCache->seek(0)) {
540 fBreakCache->populateNear(0, status);
541 }
542 fBreakCache->current();
543 U_ASSERT(fPosition == 0);
73c04bcf 544 return 0;
b75a7d8f
A
545}
546
547/**
548 * Sets the current iteration position to the end of the text.
b75a7d8f
A
549 * @return The text's past-the-end offset.
550 */
551int32_t RuleBasedBreakIterator::last(void) {
0f5d89e8
A
552 int32_t endPos = (int32_t)utext_nativeLength(&fText);
553 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
554 (void)endShouldBeBoundary;
555 U_ASSERT(endShouldBeBoundary);
556 U_ASSERT(fPosition == endPos);
557 return endPos;
b75a7d8f
A
558}
559
560/**
561 * Advances the iterator either forward or backward the specified number of steps.
562 * Negative values move backward, and positive values move forward. This is
563 * equivalent to repeatedly calling next() or previous().
564 * @param n The number of steps to move. The sign indicates the direction
565 * (negative is backwards, and positive is forwards).
566 * @return The character offset of the boundary position n boundaries away from
567 * the current one.
568 */
569int32_t RuleBasedBreakIterator::next(int32_t n) {
0f5d89e8
A
570 int32_t result = 0;
571 if (n > 0) {
572 for (; n > 0 && result != UBRK_DONE; --n) {
573 result = next();
574 }
575 } else if (n < 0) {
576 for (; n < 0 && result != UBRK_DONE; ++n) {
577 result = previous();
578 }
579 } else {
580 result = current();
b75a7d8f
A
581 }
582 return result;
583}
584
585/**
586 * Advances the iterator to the next boundary position.
587 * @return The position of the first boundary after this one.
588 */
589int32_t RuleBasedBreakIterator::next(void) {
0f5d89e8
A
590 fBreakCache->next();
591 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
592}
593
594/**
0f5d89e8
A
595 * Move the iterator backwards, to the boundary preceding the current one.
596 *
597 * Starts from the current position within fText.
598 * Starting position need not be on a boundary.
599 *
600 * @return The position of the boundary position immediately preceding the starting position.
b75a7d8f
A
601 */
602int32_t RuleBasedBreakIterator::previous(void) {
0f5d89e8
A
603 UErrorCode status = U_ZERO_ERROR;
604 fBreakCache->previous(status);
605 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
606}
607
b75a7d8f
A
608/**
609 * Sets the iterator to refer to the first boundary position following
610 * the specified position.
0f5d89e8 611 * @param startPos The position from which to begin searching for a break position.
b75a7d8f
A
612 * @return The position of the first break after the current position.
613 */
0f5d89e8
A
614int32_t RuleBasedBreakIterator::following(int32_t startPos) {
615 // if the supplied position is before the beginning, return the
b331163b 616 // text's starting offset
0f5d89e8 617 if (startPos < 0) {
b331163b
A
618 return first();
619 }
620
621 // Move requested offset to a code point start. It might be on a trail surrogate,
0f5d89e8
A
622 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
623 utext_setNativeIndex(&fText, startPos);
624 startPos = (int32_t)utext_getNativeIndex(&fText);
b75a7d8f 625
0f5d89e8
A
626 UErrorCode status = U_ZERO_ERROR;
627 fBreakCache->following(startPos, status);
628 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
629}
630
631/**
632 * Sets the iterator to refer to the last boundary position before the
633 * specified position.
0f5d89e8 634 * @param offset The position to begin searching for a break from.
b75a7d8f
A
635 * @return The position of the last boundary before the starting position.
636 */
637int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
0f5d89e8 638 if (offset > utext_nativeLength(&fText)) {
b331163b
A
639 return last();
640 }
b331163b
A
641
642 // Move requested offset to a code point start. It might be on a trail surrogate,
643 // or on a trail byte if the input is UTF-8.
73c04bcf 644
0f5d89e8 645 utext_setNativeIndex(&fText, offset);
3d1f044b 646 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
374ca955 647
0f5d89e8
A
648 UErrorCode status = U_ZERO_ERROR;
649 fBreakCache->preceding(adjustedOffset, status);
650 return fDone ? UBRK_DONE : fPosition;
b75a7d8f
A
651}
652
653/**
654 * Returns true if the specfied position is a boundary position. As a side
655 * effect, leaves the iterator pointing to the first boundary position at
656 * or after "offset".
0f5d89e8 657 *
b75a7d8f
A
658 * @param offset the offset to check.
659 * @return True if "offset" is a boundary position.
660 */
661UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
b75a7d8f 662 // out-of-range indexes are never boundary positions
73c04bcf 663 if (offset < 0) {
b75a7d8f
A
664 first(); // For side effects on current position, tag values.
665 return FALSE;
666 }
667
0f5d89e8
A
668 // Adjust offset to be on a code point boundary and not beyond the end of the text.
669 // Note that isBoundary() is always false for offsets that are not on code point boundaries.
670 // But we still need the side effect of leaving iteration at the following boundary.
671
672 utext_setNativeIndex(&fText, offset);
3d1f044b 673 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
0f5d89e8
A
674
675 bool result = false;
676 UErrorCode status = U_ZERO_ERROR;
677 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
678 result = (fBreakCache->current() == offset);
b75a7d8f
A
679 }
680
0f5d89e8
A
681 if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
682 // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
683 // but the iteration position remains set to the end of the text, which is a boundary.
684 return FALSE;
685 }
686 if (!result) {
687 // Not on a boundary. isBoundary() must leave iterator on the following boundary.
688 // Cache->seek(), above, left us on the preceding boundary, so advance one.
689 next();
690 }
73c04bcf 691 return result;
b75a7d8f
A
692}
693
0f5d89e8 694
b75a7d8f
A
695/**
696 * Returns the current iteration position.
697 * @return The current iteration position.
698 */
699int32_t RuleBasedBreakIterator::current(void) const {
0f5d89e8 700 return fPosition;
b75a7d8f 701}
0f5d89e8
A
702
703
b75a7d8f
A
704//=======================================================================
705// implementation
706//=======================================================================
707
73c04bcf
A
/**
 * RBBIRunMode - tracks where the state machine is relative to the user text.
 * The machine runs one extra iteration before the first character and one
 * after the last; user input is only fetched while in RBBI_RUN.
 */
enum RBBIRunMode {
    RBBI_START,     // processing is before the first char of input
    RBBI_RUN,       // processing is inside the user text
    RBBI_END        // processing is after the end of the user text
};
718
b75a7d8f 719
2ca993e8
A
720// Map from look-ahead break states (corresponds to rules) to boundary positions.
721// Allows multiple lookahead break rules to be in flight at the same time.
722//
723// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
724// in the state table be sequential, then we can just index an array. And the
725// table could also tell us in advance how big that array needs to be.
726//
727// Before ICU 57 there was just a single simple variable for a look-ahead match that
728// was in progress. Two rules at once did not work.
729
730static const int32_t kMaxLookaheads = 8;
731struct LookAheadResults {
732 int32_t fUsedSlotLimit;
733 int32_t fPositions[8];
734 int16_t fKeys[8];
735
3d1f044b 736 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
2ca993e8
A
737
738 int32_t getPosition(int16_t key) {
739 for (int32_t i=0; i<fUsedSlotLimit; ++i) {
740 if (fKeys[i] == key) {
741 return fPositions[i];
742 }
743 }
0f5d89e8
A
744 // with NLLT source rules, Latn sample and ubrk_next, we see a request for key 79 here
745 // near the end of text, when setPosition has only ever set positions for key 80 or 82.
3d1f044b 746 //UPRV_UNREACHABLE;
2ca993e8
A
747 return -1;
748 }
749
750 void setPosition(int16_t key, int32_t position) {
751 int32_t i;
752 for (i=0; i<fUsedSlotLimit; ++i) {
753 if (fKeys[i] == key) {
754 fPositions[i] = position;
755 return;
756 }
757 }
758 if (i >= kMaxLookaheads) {
3d1f044b
A
759 UPRV_UNREACHABLE;
760 i = kMaxLookaheads - 1; // Apple addition
2ca993e8
A
761 }
762 fKeys[i] = key;
763 fPositions[i] = position;
764 U_ASSERT(fUsedSlotLimit == i);
765 fUsedSlotLimit = i + 1;
766 }
767};
768
769
b75a7d8f
A
770//-----------------------------------------------------------------------------------
771//
0f5d89e8
A
772// handleNext()
773// Run the state machine to find a boundary
b75a7d8f
A
774//
775//-----------------------------------------------------------------------------------
0f5d89e8
A
776// Route handleNext calls through the following to handleNextInternal,
777// in order to handle fLineWordOpts.
778int32_t RuleBasedBreakIterator::handleNext() {
779 int32_t result = handleNextInternal();
780 while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
781 UChar32 prevChr = utext_char32At(&fText, result-1);
782 UChar32 currChr = utext_char32At(&fText, result);
783 if (currChr == U_SENTINEL || prevChr == U_SENTINEL) {
784 break;
785 }
786 if (fLineWordOpts == UBRK_LINEWORD_KEEP_HANGUL) {
787 UErrorCode status = U_ZERO_ERROR;
788 if (uscript_getScript(currChr, &status) != USCRIPT_HANGUL || uscript_getScript(prevChr, &status) != USCRIPT_HANGUL) {
789 break;
790 }
791 } else {
792 if (!u_isalpha(currChr) || !u_isalpha(prevChr)) {
793 break;
794 }
795 }
796 int32_t nextResult = handleNextInternal();
797 if (nextResult <= result) {
798 break;
799 }
800 result = nextResult;
801 }
802 return result;
803}
804
805int32_t RuleBasedBreakIterator::handleNextInternal() {
73c04bcf 806 int32_t state;
4388f060 807 uint16_t category = 0;
73c04bcf 808 RBBIRunMode mode;
0f5d89e8 809
73c04bcf
A
810 RBBIStateTableRow *row;
811 UChar32 c;
2ca993e8
A
812 LookAheadResults lookAheadMatches;
813 int32_t result = 0;
814 int32_t initialPosition = 0;
0f5d89e8 815 const RBBIStateTable *statetable = fData->fForwardTable;
2ca993e8
A
816 const char *tableData = statetable->fTableData;
817 uint32_t tableRowLen = statetable->fRowLen;
73c04bcf 818 #ifdef RBBI_DEBUG
0f5d89e8 819 if (gTrace) {
73c04bcf
A
820 RBBIDebugPuts("Handle Next pos char state category");
821 }
822 #endif
b75a7d8f 823
0f5d89e8
A
824 // handleNext always sets the break tag value.
825 // Set the default for it.
826 fRuleStatusIndex = 0;
827
828 fDictionaryCharCount = 0;
b75a7d8f
A
829
830 // if we're already at the end of the text, return DONE.
0f5d89e8
A
831 initialPosition = fPosition;
832 UTEXT_SETNATIVEINDEX(&fText, initialPosition);
73c04bcf 833 result = initialPosition;
0f5d89e8
A
834 c = UTEXT_NEXT32(&fText);
835 if (c==U_SENTINEL) {
836 fDone = TRUE;
837 return UBRK_DONE;
b75a7d8f
A
838 }
839
73c04bcf
A
840 // Set the initial state for the state machine
841 state = START_STATE;
842 row = (RBBIStateTableRow *)
843 //(statetable->fTableData + (statetable->fRowLen * state));
844 (tableData + tableRowLen * state);
0f5d89e8
A
845
846
73c04bcf
A
847 mode = RBBI_RUN;
848 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
849 category = 2;
850 mode = RBBI_START;
851 }
b75a7d8f 852
b75a7d8f
A
853
854 // loop until we reach the end of the text or transition to state 0
73c04bcf 855 //
b75a7d8f 856 for (;;) {
73c04bcf 857 if (c == U_SENTINEL) {
374ca955 858 // Reached end of input string.
73c04bcf 859 if (mode == RBBI_END) {
0f5d89e8 860 // We have already run the loop one last time with the
73c04bcf
A
861 // character set to the pseudo {eof} value. Now it is time
862 // to unconditionally bail out.
73c04bcf 863 break;
374ca955 864 }
73c04bcf
A
865 // Run the loop one last time with the fake end-of-input character category.
866 mode = RBBI_END;
867 category = 1;
b75a7d8f 868 }
b75a7d8f 869
b75a7d8f 870 //
73c04bcf
A
871 // Get the char category. An incoming category of 1 or 2 means that
872 // we are preset for doing the beginning or end of input, and
873 // that we shouldn't get a category from an actual text input character.
874 //
875 if (mode == RBBI_RUN) {
876 // look up the current character's character category, which tells us
877 // which column in the state table to look at.
878 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
879 // not the size of the character going in, which is a UChar32.
880 //
0f5d89e8 881 category = (fLatin1Cat!=NULL && c<0x100)? fLatin1Cat[c]: UTRIE2_GET16(fData->fTrie, c);
73c04bcf
A
882
883 // Check the dictionary bit in the character's category.
0f5d89e8 884 // Counter is only used by dictionary based iteration.
73c04bcf
A
885 // Chars that need to be handled by a dictionary have a flag bit set
886 // in their category values.
887 //
888 if ((category & 0x4000) != 0) {
889 fDictionaryCharCount++;
890 // And off the dictionary flag bit.
891 category &= ~0x4000;
892 }
b75a7d8f
A
893 }
894
4388f060 895 #ifdef RBBI_DEBUG
0f5d89e8 896 if (gTrace) {
3d1f044b 897 RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
374ca955
A
898 if (0x20<=c && c<0x7f) {
899 RBBIDebugPrintf("\"%c\" ", c);
900 } else {
901 RBBIDebugPrintf("%5x ", c);
902 }
903 RBBIDebugPrintf("%3d %3d\n", state, category);
b75a7d8f 904 }
374ca955 905 #endif
b75a7d8f 906
73c04bcf
A
907 // State Transition - move machine to its next state
908 //
4388f060 909
0f5d89e8 910 // fNextState is a variable-length array.
4388f060
A
911 U_ASSERT(category<fData->fHeader->fCatCount);
912 state = row->fNextState[category]; /*Not accessing beyond memory*/
b75a7d8f 913 row = (RBBIStateTableRow *)
73c04bcf
A
914 // (statetable->fTableData + (statetable->fRowLen * state));
915 (tableData + tableRowLen * state);
b75a7d8f 916
b75a7d8f 917
b75a7d8f 918 if (row->fAccepting == -1) {
73c04bcf
A
919 // Match found, common case.
920 if (mode != RBBI_START) {
0f5d89e8 921 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
73c04bcf 922 }
0f5d89e8 923 fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
b75a7d8f
A
924 }
925
2ca993e8
A
926 int16_t completedRule = row->fAccepting;
927 if (completedRule > 0) {
0f5d89e8 928 // Lookahead match is completed.
2ca993e8
A
929 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
930 if (lookaheadResult >= 0) {
0f5d89e8
A
931 fRuleStatusIndex = row->fTagIdx;
932 fPosition = lookaheadResult;
2ca993e8 933 return lookaheadResult;
b75a7d8f 934 }
b75a7d8f 935 }
2ca993e8
A
936 int16_t rule = row->fLookAhead;
937 if (rule != 0) {
938 // At the position of a '/' in a look-ahead match. Record it.
0f5d89e8 939 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
2ca993e8 940 lookAheadMatches.setPosition(rule, pos);
b75a7d8f
A
941 }
942
b75a7d8f 943 if (state == STOP_STATE) {
374ca955
A
944 // This is the normal exit from the lookup state machine.
945 // We have advanced through the string until it is certain that no
946 // longer match is possible, no matter what characters follow.
b75a7d8f
A
947 break;
948 }
0f5d89e8
A
949
950 // Advance to the next character.
73c04bcf
A
951 // If this is a beginning-of-input loop iteration, don't advance
952 // the input position. The next iteration will be processing the
953 // first real input character.
954 if (mode == RBBI_RUN) {
0f5d89e8 955 c = UTEXT_NEXT32(&fText);
73c04bcf
A
956 } else {
957 if (mode == RBBI_START) {
958 mode = RBBI_RUN;
959 }
960 }
b75a7d8f
A
961 }
962
374ca955 963 // The state machine is done. Check whether it found a match...
b75a7d8f 964
374ca955
A
965 // If the iterator failed to advance in the match engine, force it ahead by one.
966 // (This really indicates a defect in the break rules. They should always match
967 // at least one character.)
968 if (result == initialPosition) {
0f5d89e8
A
969 utext_setNativeIndex(&fText, initialPosition);
970 utext_next32(&fText);
971 result = (int32_t)utext_getNativeIndex(&fText);
972 fRuleStatusIndex = 0;
374ca955 973 }
b75a7d8f 974
374ca955 975 // Leave the iterator at our result position.
0f5d89e8 976 fPosition = result;
73c04bcf 977 #ifdef RBBI_DEBUG
0f5d89e8 978 if (gTrace) {
73c04bcf 979 RBBIDebugPrintf("result = %d\n\n", result);
b75a7d8f 980 }
73c04bcf 981 #endif
b75a7d8f
A
982 return result;
983}
984
985
374ca955
A
986//-----------------------------------------------------------------------------------
987//
0f5d89e8 988// handleSafePrevious()
374ca955 989//
0f5d89e8
A
990// Iterate backwards using the safe reverse rules.
991// The logic of this function is similar to handleNext(), but simpler
992// because the safe table does not require as many options.
374ca955
A
993//
994//-----------------------------------------------------------------------------------
0f5d89e8 995int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
73c04bcf 996 int32_t state;
4388f060 997 uint16_t category = 0;
73c04bcf
A
998 RBBIStateTableRow *row;
999 UChar32 c;
73c04bcf 1000 int32_t result = 0;
73c04bcf 1001
0f5d89e8
A
1002 const RBBIStateTable *stateTable = fData->fReverseTable;
1003 UTEXT_SETNATIVEINDEX(&fText, fromPosition);
73c04bcf 1004 #ifdef RBBI_DEBUG
0f5d89e8 1005 if (gTrace) {
73c04bcf
A
1006 RBBIDebugPuts("Handle Previous pos char state category");
1007 }
1008 #endif
1009
73c04bcf 1010 // if we're already at the start of the text, return DONE.
0f5d89e8 1011 if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
73c04bcf
A
1012 return BreakIterator::DONE;
1013 }
374ca955 1014
73c04bcf 1015 // Set the initial state for the state machine
0f5d89e8 1016 c = UTEXT_PREVIOUS32(&fText);
73c04bcf 1017 state = START_STATE;
374ca955 1018 row = (RBBIStateTableRow *)
0f5d89e8 1019 (stateTable->fTableData + (stateTable->fRowLen * state));
374ca955 1020
73c04bcf
A
1021 // loop until we reach the start of the text or transition to state 0
1022 //
0f5d89e8 1023 for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
374ca955 1024
0f5d89e8
A
1025 // look up the current character's character category, which tells us
1026 // which column in the state table to look at.
1027 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1028 // not the size of the character going in, which is a UChar32.
374ca955 1029 //
0f5d89e8
A
1030 // And off the dictionary flag bit. For reverse iteration it is not used.
1031 category = UTRIE2_GET16(fData->fTrie, c);
1032 category &= ~0x4000;
374ca955
A
1033
1034 #ifdef RBBI_DEBUG
0f5d89e8
A
1035 if (gTrace) {
1036 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
374ca955
A
1037 if (0x20<=c && c<0x7f) {
1038 RBBIDebugPrintf("\"%c\" ", c);
1039 } else {
1040 RBBIDebugPrintf("%5x ", c);
1041 }
1042 RBBIDebugPrintf("%3d %3d\n", state, category);
1043 }
1044 #endif
1045
73c04bcf
A
1046 // State Transition - move machine to its next state
1047 //
0f5d89e8 1048 // fNextState is a variable-length array.
4388f060
A
1049 U_ASSERT(category<fData->fHeader->fCatCount);
1050 state = row->fNextState[category]; /*Not accessing beyond memory*/
374ca955 1051 row = (RBBIStateTableRow *)
0f5d89e8 1052 (stateTable->fTableData + (stateTable->fRowLen * state));
374ca955 1053
374ca955 1054 if (state == STOP_STATE) {
73c04bcf 1055 // This is the normal exit from the lookup state machine.
0f5d89e8 1056 // Transistion to state zero means we have found a safe point.
374ca955
A
1057 break;
1058 }
374ca955
A
1059 }
1060
73c04bcf 1061 // The state machine is done. Check whether it found a match...
0f5d89e8 1062 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
73c04bcf 1063 #ifdef RBBI_DEBUG
0f5d89e8 1064 if (gTrace) {
73c04bcf
A
1065 RBBIDebugPrintf("result = %d\n\n", result);
1066 }
1067 #endif
374ca955
A
1068 return result;
1069}
1070
b75a7d8f
A
1071//-------------------------------------------------------------------------------
1072//
1073// getRuleStatus() Return the break rule tag associated with the current
1074// iterator position. If the iterator arrived at its current
1075// position by iterating forwards, the value will have been
1076// cached by the handleNext() function.
1077//
b75a7d8f 1078//-------------------------------------------------------------------------------
b75a7d8f 1079
374ca955 1080int32_t RuleBasedBreakIterator::getRuleStatus() const {
374ca955
A
1081
1082 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1083 // (the number of status values.)
1084 // This function returns the last (largest) of the array of status values.
0f5d89e8 1085 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
374ca955
A
1086 int32_t tagVal = fData->fRuleStatusTable[idx];
1087
1088 return tagVal;
1089}
1090
1091
374ca955 1092int32_t RuleBasedBreakIterator::getRuleStatusVec(
0f5d89e8 1093 int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
374ca955
A
1094 if (U_FAILURE(status)) {
1095 return 0;
1096 }
1097
0f5d89e8 1098 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
374ca955
A
1099 int32_t numValsToCopy = numVals;
1100 if (numVals > capacity) {
1101 status = U_BUFFER_OVERFLOW_ERROR;
1102 numValsToCopy = capacity;
1103 }
1104 int i;
1105 for (i=0; i<numValsToCopy; i++) {
0f5d89e8 1106 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
374ca955
A
1107 }
1108 return numVals;
1109}
1110
0f5d89e8
A
1111// Apple custom addition
1112int32_t RuleBasedBreakIterator::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
1113{
0f5d89e8
A
1114 if (fDone) {
1115 return 0;
1116 }
1117 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
1118 RuleBasedTokenRange *outTokenP = outTokenRanges;
1119 int32_t lastOffset = fPosition;
0f5d89e8
A
1120 while (outTokenP < outTokenLimit) {
1121 // start portion from inlining populateFollowing()
1122 int32_t pos = 0;
1123 int32_t ruleStatusIdx = 0;
1124 int32_t startPos = fPosition;
1125
1126 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1127 fPosition = pos;
1128 fRuleStatusIndex = ruleStatusIdx;
1129 } else {
1130 pos = handleNextInternal(); // sets fRuleStatusIndex for the pos it returns, updates fPosition
1131 if (pos == UBRK_DONE) {
1132 // fDone = TRUE; already set by handleNextInternal
1133 break;
1134 }
1135 // Use current result from handleNextInternal(), including fRuleStatusIndex,
1136 // unless overridden by dictionary subdivisions
1137 fPosition = pos;
1138 if (fDictionaryCharCount > 0) {
1139 // The text segment obtained from the rules includes dictionary characters.
1140 // Subdivide it, with subdivided results going into the dictionary cache.
1141 fDictionaryCache->populateDictionary(startPos, pos, fRuleStatusIndex, fRuleStatusIndex);
1142 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1143 fPosition = pos;
1144 fRuleStatusIndex = ruleStatusIdx;
1145 }
1146 }
1147 }
1148 // end portion from inlining populateFollowing()
1149 int32_t flagCount = fData->fRuleStatusTable[fRuleStatusIndex];
1150 const int32_t* flagPtr = fData->fRuleStatusTable + fRuleStatusIndex + flagCount;
1151 int32_t flagSet = *flagPtr; // if -1 then skip token
1152 if (flagSet != -1) {
1153 outTokenP->location = lastOffset;
1154 outTokenP++->length = fPosition - lastOffset;
1155 if (outTokenFlags) {
1156 // flagSet should be the OR of all flags returned by getRuleStatusVec;
1157 // here we collect from high-order to low-order.
1158 while (--flagCount > 0) {
1159 flagSet |= *--flagPtr;
1160 }
1161 *outTokenFlags++ = (unsigned long)flagSet;
1162 }
1163 }
1164 lastOffset = fPosition;
1165 }
1166 return (outTokenP - outTokenRanges);
1167}
374ca955 1168
b75a7d8f
A
1169//-------------------------------------------------------------------------------
1170//
1171// getBinaryRules Access to the compiled form of the rules,
1172// for use by build system tools that save the data
1173// for standard iterator types.
1174//
1175//-------------------------------------------------------------------------------
1176const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1177 const uint8_t *retPtr = NULL;
1178 length = 0;
1179
1180 if (fData != NULL) {
1181 retPtr = (const uint8_t *)fData->fHeader;
1182 length = fData->fHeader->fLength;
1183 }
1184 return retPtr;
1185}
1186
1187
57a6839d 1188BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
b75a7d8f
A
1189 int32_t &bufferSize,
1190 UErrorCode &status)
1191{
1192 if (U_FAILURE(status)){
1193 return NULL;
1194 }
1195
b75a7d8f 1196 if (bufferSize == 0) {
57a6839d 1197 bufferSize = 1; // preflighting for deprecated functionality
b75a7d8f
A
1198 return NULL;
1199 }
1200
57a6839d
A
1201 BreakIterator *clonedBI = clone();
1202 if (clonedBI == NULL) {
1203 status = U_MEMORY_ALLOCATION_ERROR;
1204 } else {
1205 status = U_SAFECLONE_ALLOCATED_WARNING;
b75a7d8f 1206 }
57a6839d 1207 return (RuleBasedBreakIterator *)clonedBI;
b75a7d8f
A
1208}
1209
73c04bcf
A
1210U_NAMESPACE_END
1211
73c04bcf 1212
0f5d89e8
A
1213static icu::UStack *gLanguageBreakFactories = nullptr;
1214static const icu::UnicodeString *gEmptyString = nullptr;
57a6839d 1215static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
0f5d89e8 1216static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
46f4442e 1217
73c04bcf 1218/**
0f5d89e8 1219 * Release all static memory held by breakiterator.
73c04bcf
A
1220 */
1221U_CDECL_BEGIN
0f5d89e8
A
1222static UBool U_CALLCONV rbbi_cleanup(void) {
1223 delete gLanguageBreakFactories;
1224 gLanguageBreakFactories = nullptr;
1225 delete gEmptyString;
1226 gEmptyString = nullptr;
57a6839d 1227 gLanguageBreakFactoriesInitOnce.reset();
0f5d89e8 1228 gRBBIInitOnce.reset();
73c04bcf 1229 return TRUE;
b75a7d8f 1230}
73c04bcf 1231U_CDECL_END
b75a7d8f 1232
73c04bcf
A
1233U_CDECL_BEGIN
1234static void U_CALLCONV _deleteFactory(void *obj) {
4388f060 1235 delete (icu::LanguageBreakFactory *) obj;
73c04bcf
A
1236}
1237U_CDECL_END
1238U_NAMESPACE_BEGIN
b75a7d8f 1239
0f5d89e8
A
1240static void U_CALLCONV rbbiInit() {
1241 gEmptyString = new UnicodeString();
1242 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1243}
1244
57a6839d
A
1245static void U_CALLCONV initLanguageFactories() {
1246 UErrorCode status = U_ZERO_ERROR;
1247 U_ASSERT(gLanguageBreakFactories == NULL);
1248 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1249 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1250 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1251 gLanguageBreakFactories->push(builtIn, status);
73c04bcf 1252#ifdef U_LOCAL_SERVICE_HOOK
57a6839d
A
1253 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1254 if (extra != NULL) {
1255 gLanguageBreakFactories->push(extra, status);
73c04bcf 1256 }
57a6839d 1257#endif
73c04bcf 1258 }
0f5d89e8 1259 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
57a6839d
A
1260}
1261
1262
1263static const LanguageBreakEngine*
0f5d89e8 1264getLanguageBreakEngineFromFactory(UChar32 c)
57a6839d
A
1265{
1266 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
73c04bcf
A
1267 if (gLanguageBreakFactories == NULL) {
1268 return NULL;
1269 }
0f5d89e8 1270
73c04bcf
A
1271 int32_t i = gLanguageBreakFactories->size();
1272 const LanguageBreakEngine *lbe = NULL;
1273 while (--i >= 0) {
1274 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
0f5d89e8 1275 lbe = factory->getEngineFor(c);
73c04bcf
A
1276 if (lbe != NULL) {
1277 break;
1278 }
1279 }
1280 return lbe;
1281}
1282
1283
1284//-------------------------------------------------------------------------------
1285//
1286// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
51004dcb 1287// the character c.
73c04bcf
A
1288//
1289//-------------------------------------------------------------------------------
1290const LanguageBreakEngine *
1291RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1292 const LanguageBreakEngine *lbe = NULL;
1293 UErrorCode status = U_ZERO_ERROR;
0f5d89e8 1294
73c04bcf
A
1295 if (fLanguageBreakEngines == NULL) {
1296 fLanguageBreakEngines = new UStack(status);
46f4442e 1297 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
73c04bcf
A
1298 delete fLanguageBreakEngines;
1299 fLanguageBreakEngines = 0;
1300 return NULL;
1301 }
1302 }
0f5d89e8 1303
73c04bcf
A
1304 int32_t i = fLanguageBreakEngines->size();
1305 while (--i >= 0) {
1306 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
0f5d89e8 1307 if (lbe->handles(c)) {
73c04bcf
A
1308 return lbe;
1309 }
1310 }
0f5d89e8 1311
73c04bcf
A
1312 // No existing dictionary took the character. See if a factory wants to
1313 // give us a new LanguageBreakEngine for this character.
0f5d89e8
A
1314 lbe = getLanguageBreakEngineFromFactory(c);
1315
73c04bcf
A
1316 // If we got one, use it and push it on our stack.
1317 if (lbe != NULL) {
1318 fLanguageBreakEngines->push((void *)lbe, status);
1319 // Even if we can't remember it, we can keep looking it up, so
1320 // return it even if the push fails.
1321 return lbe;
1322 }
0f5d89e8 1323
73c04bcf
A
1324 // No engine is forthcoming for this character. Add it to the
1325 // reject set. Create the reject break engine if needed.
1326 if (fUnhandledBreakEngine == NULL) {
1327 fUnhandledBreakEngine = new UnhandledEngine(status);
1328 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1329 status = U_MEMORY_ALLOCATION_ERROR;
0f5d89e8 1330 return nullptr;
73c04bcf
A
1331 }
1332 // Put it last so that scripts for which we have an engine get tried
1333 // first.
1334 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1335 // If we can't insert it, or creation failed, get rid of it
1336 if (U_FAILURE(status)) {
1337 delete fUnhandledBreakEngine;
1338 fUnhandledBreakEngine = 0;
1339 return NULL;
1340 }
1341 }
0f5d89e8 1342
73c04bcf
A
1343 // Tell the reject engine about the character; at its discretion, it may
1344 // add more than just the one character.
0f5d89e8
A
1345 fUnhandledBreakEngine->handleCharacter(c);
1346
73c04bcf
A
1347 return fUnhandledBreakEngine;
1348}
1349
0f5d89e8
A
1350void RuleBasedBreakIterator::dumpCache() {
1351 fBreakCache->dumpCache();
1352}
73c04bcf 1353
0f5d89e8
A
1354void RuleBasedBreakIterator::dumpTables() {
1355 fData->printData();
1356}
73c04bcf 1357
0f5d89e8
A
1358/**
1359 * Returns the description used to create this iterator
1360 */
73c04bcf 1361
0f5d89e8
A
1362const UnicodeString&
1363RuleBasedBreakIterator::getRules() const {
1364 if (fData != NULL) {
1365 return fData->getRuleSourceString();
1366 } else {
1367 umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1368 return *gEmptyString;
1369 }
73c04bcf 1370}
b75a7d8f
A
1371
1372U_NAMESPACE_END
1373
1374#endif /* #if !UCONFIG_NO_BREAK_ITERATION */