]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
46f4442e A |
3 | /* |
4 | ********************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 2008-2015, International Business Machines |
46f4442e A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 05/11/2008 Andy Heninger Port from Java | |
10 | ********************************************************************** | |
11 | */ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION | |
16 | ||
2ca993e8 A |
17 | #include "unicode/brkiter.h" |
18 | #include "unicode/localpointer.h" | |
46f4442e | 19 | #include "unicode/uchar.h" |
2ca993e8 | 20 | #include "unicode/unifilt.h" |
46f4442e | 21 | #include "unicode/uniset.h" |
2ca993e8 | 22 | |
46f4442e | 23 | #include "brktrans.h" |
46f4442e | 24 | #include "cmemory.h" |
2ca993e8 | 25 | #include "mutex.h" |
46f4442e A |
26 | #include "uprops.h" |
27 | #include "uinvchar.h" | |
28 | #include "util.h" | |
29 | #include "uvectr32.h" | |
30 | ||
31 | U_NAMESPACE_BEGIN | |
32 | ||
33 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) | |
34 | ||
35 | static const UChar SPACE = 32; // ' ' | |
36 | ||
37 | ||
38 | /** | |
39 | * Constructs a transliterator whose default insertion string is a | |
40 | * single space (U+0020); use setInsertion() to change it. | |
41 | */ | |
42 | BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : | |
2ca993e8 A |
43 | Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
44 | cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { | |
46f4442e A |
45 | } |
46 | ||
47 | ||
48 | /** | |
49 | * Destructor. | |
50 | */ | |
51 | BreakTransliterator::~BreakTransliterator() { | |
46f4442e A |
52 | } |
53 | ||
54 | /** | |
55 | * Copy constructor. | |
56 | */ | |
57 | BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : | |
2ca993e8 A |
58 | Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { |
59 | } | |
46f4442e A |
60 | |
61 | ||
62 | /** | |
63 | * Transliterator API. | |
64 | */ | |
65 | Transliterator* BreakTransliterator::clone(void) const { | |
66 | return new BreakTransliterator(*this); | |
67 | } | |
68 | ||
69 | /** | |
70 | * Implements {@link Transliterator#handleTransliterate}. | |
71 | */ | |
72 | void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
73 | UBool isIncremental ) const { | |
74 | ||
75 | UErrorCode status = U_ZERO_ERROR; | |
2ca993e8 A |
76 | LocalPointer<BreakIterator> bi; |
77 | LocalPointer<UVector32> boundaries; | |
78 | ||
79 | { | |
80 | Mutex m; | |
81 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); | |
82 | boundaries.moveFrom(nonConstThis->cachedBoundaries); | |
83 | bi.moveFrom(nonConstThis->cachedBI); | |
84 | } | |
85 | if (bi.isNull()) { | |
86 | bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); | |
87 | } | |
88 | if (boundaries.isNull()) { | |
89 | boundaries.adoptInstead(new UVector32(status)); | |
90 | } | |
91 | ||
92 | if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { | |
93 | return; | |
94 | } | |
95 | ||
46f4442e | 96 | boundaries->removeAllElements(); |
46f4442e A |
97 | UnicodeString sText = replaceableAsString(text); |
98 | bi->setText(sText); | |
99 | bi->preceding(offsets.start); | |
100 | ||
101 | // To make things much easier, we will stack the boundaries, and then insert at the end. | |
102 | // generally, we won't need too many, since we will be filtered. | |
103 | ||
104 | int32_t boundary; | |
105 | for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { | |
106 | if (boundary == 0) continue; | |
107 | // HACK: Check to see that preceeding item was a letter | |
108 | ||
109 | UChar32 cp = sText.char32At(boundary-1); | |
110 | int type = u_charType(cp); | |
111 | //System.out.println(Integer.toString(cp,16) + " (before): " + type); | |
112 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
113 | ||
114 | cp = sText.char32At(boundary); | |
115 | type = u_charType(cp); | |
116 | //System.out.println(Integer.toString(cp,16) + " (after): " + type); | |
117 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
118 | ||
119 | boundaries->addElement(boundary, status); | |
729e4ab9 | 120 | // printf("Boundary at %d\n", boundary); |
46f4442e A |
121 | } |
122 | ||
123 | int delta = 0; | |
124 | int lastBoundary = 0; | |
125 | ||
126 | if (boundaries->size() != 0) { // if we found something, adjust | |
127 | delta = boundaries->size() * fInsertion.length(); | |
128 | lastBoundary = boundaries->lastElementi(); | |
129 | ||
130 | // we do this from the end backwards, so that we don't have to keep updating. | |
131 | ||
132 | while (boundaries->size() > 0) { | |
133 | boundary = boundaries->popi(); | |
134 | text.handleReplaceBetween(boundary, boundary, fInsertion); | |
135 | } | |
136 | } | |
137 | ||
138 | // Now fix up the return values | |
139 | offsets.contextLimit += delta; | |
140 | offsets.limit += delta; | |
141 | offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; | |
142 | ||
2ca993e8 A |
143 | // Return break iterator & boundaries vector to the cache. |
144 | { | |
145 | Mutex m; | |
146 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); | |
147 | if (nonConstThis->cachedBI.isNull()) { | |
148 | nonConstThis->cachedBI.moveFrom(bi); | |
149 | } | |
150 | if (nonConstThis->cachedBoundaries.isNull()) { | |
151 | nonConstThis->cachedBoundaries.moveFrom(boundaries); | |
152 | } | |
153 | } | |
154 | ||
46f4442e A |
155 | // TODO: do something with U_FAILURE(status); |
156 | // (need to look at transliterators overall, not just here.) | |
157 | } | |
158 | ||
159 | // | |
160 | // getInsertion() | |
161 | // | |
162 | const UnicodeString &BreakTransliterator::getInsertion() const { | |
163 | return fInsertion; | |
164 | } | |
165 | ||
166 | // | |
167 | // setInsertion() | |
168 | // | |
729e4ab9 | 169 | void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
46f4442e A |
170 | this->fInsertion = insertion; |
171 | } | |
172 | ||
46f4442e A |
173 | // |
174 | // replaceableAsString Hack to let break iterators work | |
175 | // on the replaceable text from transliterators. | |
176 | // In practice, the only real Replaceable type that we | |
177 | // will be seeing is UnicodeString, so this function | |
178 | // will normally be efficient. | |
179 | // | |
180 | UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { | |
46f4442e | 181 | UnicodeString s; |
729e4ab9 A |
182 | UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
183 | if (rs != NULL) { | |
184 | s = *rs; | |
185 | } else { | |
186 | r.extractBetween(0, r.length(), s); | |
187 | } | |
46f4442e A |
188 | return s; |
189 | } | |
190 | ||
191 | U_NAMESPACE_END | |
192 | ||
193 | #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */