]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/brktrans.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / brktrans.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
46f4442e
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (C) 2008-2015, International Business Machines
46f4442e
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 05/11/2008 Andy Heninger Port from Java
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
16
2ca993e8
A
17#include "unicode/brkiter.h"
18#include "unicode/localpointer.h"
46f4442e 19#include "unicode/uchar.h"
2ca993e8 20#include "unicode/unifilt.h"
46f4442e 21#include "unicode/uniset.h"
2ca993e8 22
46f4442e 23#include "brktrans.h"
46f4442e 24#include "cmemory.h"
2ca993e8 25#include "mutex.h"
46f4442e
A
26#include "uprops.h"
27#include "uinvchar.h"
28#include "util.h"
29#include "uvectr32.h"
30
31U_NAMESPACE_BEGIN
32
33UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
34
35static const UChar SPACE = 32; // ' '
36
37
38/**
39 * Constructs a transliterator with the default delimiters '{' and
40 * '}'.
41 */
42BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
2ca993e8
A
43 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
44 cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
46f4442e
A
45 }
46
47
48/**
49 * Destructor.
50 */
51BreakTransliterator::~BreakTransliterator() {
46f4442e
A
52}
53
54/**
55 * Copy constructor.
56 */
57BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
2ca993e8
A
58 Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
59}
46f4442e
A
60
61
62/**
63 * Transliterator API.
64 */
65Transliterator* BreakTransliterator::clone(void) const {
66 return new BreakTransliterator(*this);
67}
68
69/**
70 * Implements {@link Transliterator#handleTransliterate}.
71 */
72void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
73 UBool isIncremental ) const {
74
75 UErrorCode status = U_ZERO_ERROR;
2ca993e8
A
76 LocalPointer<BreakIterator> bi;
77 LocalPointer<UVector32> boundaries;
78
79 {
80 Mutex m;
81 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
82 boundaries.moveFrom(nonConstThis->cachedBoundaries);
83 bi.moveFrom(nonConstThis->cachedBI);
84 }
85 if (bi.isNull()) {
86 bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
87 }
88 if (boundaries.isNull()) {
89 boundaries.adoptInstead(new UVector32(status));
90 }
91
92 if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
93 return;
94 }
95
46f4442e 96 boundaries->removeAllElements();
46f4442e
A
97 UnicodeString sText = replaceableAsString(text);
98 bi->setText(sText);
99 bi->preceding(offsets.start);
100
101 // To make things much easier, we will stack the boundaries, and then insert at the end.
102 // generally, we won't need too many, since we will be filtered.
103
104 int32_t boundary;
105 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
106 if (boundary == 0) continue;
107 // HACK: Check to see that preceeding item was a letter
108
109 UChar32 cp = sText.char32At(boundary-1);
110 int type = u_charType(cp);
111 //System.out.println(Integer.toString(cp,16) + " (before): " + type);
112 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
113
114 cp = sText.char32At(boundary);
115 type = u_charType(cp);
116 //System.out.println(Integer.toString(cp,16) + " (after): " + type);
117 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
118
119 boundaries->addElement(boundary, status);
729e4ab9 120 // printf("Boundary at %d\n", boundary);
46f4442e
A
121 }
122
123 int delta = 0;
124 int lastBoundary = 0;
125
126 if (boundaries->size() != 0) { // if we found something, adjust
127 delta = boundaries->size() * fInsertion.length();
128 lastBoundary = boundaries->lastElementi();
129
130 // we do this from the end backwards, so that we don't have to keep updating.
131
132 while (boundaries->size() > 0) {
133 boundary = boundaries->popi();
134 text.handleReplaceBetween(boundary, boundary, fInsertion);
135 }
136 }
137
138 // Now fix up the return values
139 offsets.contextLimit += delta;
140 offsets.limit += delta;
141 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
142
2ca993e8
A
143 // Return break iterator & boundaries vector to the cache.
144 {
145 Mutex m;
146 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
147 if (nonConstThis->cachedBI.isNull()) {
148 nonConstThis->cachedBI.moveFrom(bi);
149 }
150 if (nonConstThis->cachedBoundaries.isNull()) {
151 nonConstThis->cachedBoundaries.moveFrom(boundaries);
152 }
153 }
154
46f4442e
A
155 // TODO: do something with U_FAILURE(status);
156 // (need to look at transliterators overall, not just here.)
157}
158
159//
160// getInsertion()
161//
162const UnicodeString &BreakTransliterator::getInsertion() const {
163 return fInsertion;
164}
165
166//
167// setInsertion()
168//
729e4ab9 169void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
46f4442e
A
170 this->fInsertion = insertion;
171}
172
46f4442e
A
173//
174// replaceableAsString Hack to let break iterators work
175// on the replaceable text from transliterators.
176// In practice, the only real Replaceable type that we
177// will be seeing is UnicodeString, so this function
178// will normally be efficient.
179//
180UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
46f4442e 181 UnicodeString s;
729e4ab9
A
182 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
183 if (rs != NULL) {
184 s = *rs;
185 } else {
186 r.extractBetween(0, r.length(), s);
187 }
46f4442e
A
188 return s;
189}
190
191U_NAMESPACE_END
192
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */