]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/brktrans.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / brktrans.cpp
CommitLineData
46f4442e
A
1/*
2**********************************************************************
2ca993e8 3* Copyright (C) 2008-2015, International Business Machines
46f4442e
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* Date Name Description
7* 05/11/2008 Andy Heninger Port from Java
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
14
2ca993e8
A
15#include "unicode/brkiter.h"
16#include "unicode/localpointer.h"
46f4442e 17#include "unicode/uchar.h"
2ca993e8 18#include "unicode/unifilt.h"
46f4442e 19#include "unicode/uniset.h"
2ca993e8 20
46f4442e 21#include "brktrans.h"
46f4442e 22#include "cmemory.h"
2ca993e8 23#include "mutex.h"
46f4442e
A
24#include "uprops.h"
25#include "uinvchar.h"
26#include "util.h"
27#include "uvectr32.h"
28
29U_NAMESPACE_BEGIN
30
31UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
32
33static const UChar SPACE = 32; // ' '
34
35
36/**
37 * Constructs a transliterator with the default delimiters '{' and
38 * '}'.
39 */
40BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
2ca993e8
A
41 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
42 cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
46f4442e
A
43 }
44
45
46/**
47 * Destructor.
48 */
49BreakTransliterator::~BreakTransliterator() {
46f4442e
A
50}
51
52/**
53 * Copy constructor.
54 */
55BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
2ca993e8
A
56 Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
57}
46f4442e
A
58
59
60/**
61 * Transliterator API.
62 */
63Transliterator* BreakTransliterator::clone(void) const {
64 return new BreakTransliterator(*this);
65}
66
67/**
68 * Implements {@link Transliterator#handleTransliterate}.
69 */
70void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
71 UBool isIncremental ) const {
72
73 UErrorCode status = U_ZERO_ERROR;
2ca993e8
A
74 LocalPointer<BreakIterator> bi;
75 LocalPointer<UVector32> boundaries;
76
77 {
78 Mutex m;
79 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
80 boundaries.moveFrom(nonConstThis->cachedBoundaries);
81 bi.moveFrom(nonConstThis->cachedBI);
82 }
83 if (bi.isNull()) {
84 bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
85 }
86 if (boundaries.isNull()) {
87 boundaries.adoptInstead(new UVector32(status));
88 }
89
90 if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
91 return;
92 }
93
46f4442e 94 boundaries->removeAllElements();
46f4442e
A
95 UnicodeString sText = replaceableAsString(text);
96 bi->setText(sText);
97 bi->preceding(offsets.start);
98
99 // To make things much easier, we will stack the boundaries, and then insert at the end.
100 // generally, we won't need too many, since we will be filtered.
101
102 int32_t boundary;
103 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
104 if (boundary == 0) continue;
105 // HACK: Check to see that preceeding item was a letter
106
107 UChar32 cp = sText.char32At(boundary-1);
108 int type = u_charType(cp);
109 //System.out.println(Integer.toString(cp,16) + " (before): " + type);
110 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
111
112 cp = sText.char32At(boundary);
113 type = u_charType(cp);
114 //System.out.println(Integer.toString(cp,16) + " (after): " + type);
115 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
116
117 boundaries->addElement(boundary, status);
729e4ab9 118 // printf("Boundary at %d\n", boundary);
46f4442e
A
119 }
120
121 int delta = 0;
122 int lastBoundary = 0;
123
124 if (boundaries->size() != 0) { // if we found something, adjust
125 delta = boundaries->size() * fInsertion.length();
126 lastBoundary = boundaries->lastElementi();
127
128 // we do this from the end backwards, so that we don't have to keep updating.
129
130 while (boundaries->size() > 0) {
131 boundary = boundaries->popi();
132 text.handleReplaceBetween(boundary, boundary, fInsertion);
133 }
134 }
135
136 // Now fix up the return values
137 offsets.contextLimit += delta;
138 offsets.limit += delta;
139 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
140
2ca993e8
A
141 // Return break iterator & boundaries vector to the cache.
142 {
143 Mutex m;
144 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
145 if (nonConstThis->cachedBI.isNull()) {
146 nonConstThis->cachedBI.moveFrom(bi);
147 }
148 if (nonConstThis->cachedBoundaries.isNull()) {
149 nonConstThis->cachedBoundaries.moveFrom(boundaries);
150 }
151 }
152
46f4442e
A
153 // TODO: do something with U_FAILURE(status);
154 // (need to look at transliterators overall, not just here.)
155}
156
157//
158// getInsertion()
159//
160const UnicodeString &BreakTransliterator::getInsertion() const {
161 return fInsertion;
162}
163
164//
165// setInsertion()
166//
729e4ab9 167void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
46f4442e
A
168 this->fInsertion = insertion;
169}
170
46f4442e
A
171//
172// replaceableAsString Hack to let break iterators work
173// on the replaceable text from transliterators.
174// In practice, the only real Replaceable type that we
175// will be seeing is UnicodeString, so this function
176// will normally be efficient.
177//
178UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
46f4442e 179 UnicodeString s;
729e4ab9
A
180 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
181 if (rs != NULL) {
182 s = *rs;
183 } else {
184 r.extractBetween(0, r.length(), s);
185 }
46f4442e
A
186 return s;
187}
188
189U_NAMESPACE_END
190
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */