]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ********************************************************************** | |
5 | * Copyright (C) 2008-2015, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 05/11/2008 Andy Heninger Port from Java | |
10 | ********************************************************************** | |
11 | */ | |
12 | ||
13 | #include <utility> | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | ||
17 | #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION | |
18 | ||
19 | #include "unicode/brkiter.h" | |
20 | #include "unicode/localpointer.h" | |
21 | #include "unicode/uchar.h" | |
22 | #include "unicode/unifilt.h" | |
23 | #include "unicode/uniset.h" | |
24 | ||
25 | #include "brktrans.h" | |
26 | #include "cmemory.h" | |
27 | #include "mutex.h" | |
28 | #include "uprops.h" | |
29 | #include "uinvchar.h" | |
30 | #include "util.h" | |
31 | #include "uvectr32.h" | |
32 | ||
33 | U_NAMESPACE_BEGIN | |
34 | ||
35 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) | |
36 | ||
37 | static const UChar SPACE = 32; // ' ' | |
38 | ||
39 | ||
40 | /** | |
41 | * Constructs a transliterator with the default delimiters '{' and | |
42 | * '}'. | |
43 | */ | |
44 | BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : | |
45 | Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), | |
46 | cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { | |
47 | } | |
48 | ||
49 | ||
50 | /** | |
51 | * Destructor. | |
52 | */ | |
53 | BreakTransliterator::~BreakTransliterator() { | |
54 | } | |
55 | ||
56 | /** | |
57 | * Copy constructor. | |
58 | */ | |
59 | BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : | |
60 | Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { | |
61 | } | |
62 | ||
63 | ||
64 | /** | |
65 | * Transliterator API. | |
66 | */ | |
67 | BreakTransliterator* BreakTransliterator::clone() const { | |
68 | return new BreakTransliterator(*this); | |
69 | } | |
70 | ||
71 | /** | |
72 | * Implements {@link Transliterator#handleTransliterate}. | |
73 | */ | |
74 | void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
75 | UBool isIncremental ) const { | |
76 | ||
77 | UErrorCode status = U_ZERO_ERROR; | |
78 | LocalPointer<BreakIterator> bi; | |
79 | LocalPointer<UVector32> boundaries; | |
80 | ||
81 | { | |
82 | Mutex m; | |
83 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); | |
84 | boundaries = std::move(nonConstThis->cachedBoundaries); | |
85 | bi = std::move(nonConstThis->cachedBI); | |
86 | } | |
87 | if (bi.isNull()) { | |
88 | bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); | |
89 | } | |
90 | if (boundaries.isNull()) { | |
91 | boundaries.adoptInstead(new UVector32(status)); | |
92 | } | |
93 | ||
94 | if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { | |
95 | return; | |
96 | } | |
97 | ||
98 | boundaries->removeAllElements(); | |
99 | UnicodeString sText = replaceableAsString(text); | |
100 | bi->setText(sText); | |
101 | bi->preceding(offsets.start); | |
102 | ||
103 | // To make things much easier, we will stack the boundaries, and then insert at the end. | |
104 | // generally, we won't need too many, since we will be filtered. | |
105 | ||
106 | int32_t boundary; | |
107 | for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { | |
108 | if (boundary == 0) continue; | |
109 | // HACK: Check to see that preceeding item was a letter | |
110 | ||
111 | UChar32 cp = sText.char32At(boundary-1); | |
112 | int type = u_charType(cp); | |
113 | //System.out.println(Integer.toString(cp,16) + " (before): " + type); | |
114 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
115 | ||
116 | cp = sText.char32At(boundary); | |
117 | type = u_charType(cp); | |
118 | //System.out.println(Integer.toString(cp,16) + " (after): " + type); | |
119 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
120 | ||
121 | boundaries->addElement(boundary, status); | |
122 | // printf("Boundary at %d\n", boundary); | |
123 | } | |
124 | ||
125 | int delta = 0; | |
126 | int lastBoundary = 0; | |
127 | ||
128 | if (boundaries->size() != 0) { // if we found something, adjust | |
129 | delta = boundaries->size() * fInsertion.length(); | |
130 | lastBoundary = boundaries->lastElementi(); | |
131 | ||
132 | // we do this from the end backwards, so that we don't have to keep updating. | |
133 | ||
134 | while (boundaries->size() > 0) { | |
135 | boundary = boundaries->popi(); | |
136 | text.handleReplaceBetween(boundary, boundary, fInsertion); | |
137 | } | |
138 | } | |
139 | ||
140 | // Now fix up the return values | |
141 | offsets.contextLimit += delta; | |
142 | offsets.limit += delta; | |
143 | offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; | |
144 | ||
145 | // Return break iterator & boundaries vector to the cache. | |
146 | { | |
147 | Mutex m; | |
148 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); | |
149 | if (nonConstThis->cachedBI.isNull()) { | |
150 | nonConstThis->cachedBI = std::move(bi); | |
151 | } | |
152 | if (nonConstThis->cachedBoundaries.isNull()) { | |
153 | nonConstThis->cachedBoundaries = std::move(boundaries); | |
154 | } | |
155 | } | |
156 | ||
157 | // TODO: do something with U_FAILURE(status); | |
158 | // (need to look at transliterators overall, not just here.) | |
159 | } | |
160 | ||
161 | // | |
162 | // getInsertion() | |
163 | // | |
164 | const UnicodeString &BreakTransliterator::getInsertion() const { | |
165 | return fInsertion; | |
166 | } | |
167 | ||
168 | // | |
169 | // setInsertion() | |
170 | // | |
171 | void BreakTransliterator::setInsertion(const UnicodeString &insertion) { | |
172 | this->fInsertion = insertion; | |
173 | } | |
174 | ||
175 | // | |
176 | // replaceableAsString Hack to let break iterators work | |
177 | // on the replaceable text from transliterators. | |
178 | // In practice, the only real Replaceable type that we | |
179 | // will be seeing is UnicodeString, so this function | |
180 | // will normally be efficient. | |
181 | // | |
182 | UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { | |
183 | UnicodeString s; | |
184 | UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); | |
185 | if (rs != NULL) { | |
186 | s = *rs; | |
187 | } else { | |
188 | r.extractBetween(0, r.length(), s); | |
189 | } | |
190 | return s; | |
191 | } | |
192 | ||
193 | U_NAMESPACE_END | |
194 | ||
195 | #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */ | |