]>
Commit | Line | Data |
---|---|---|
46f4442e A |
1 | /* |
2 | ********************************************************************** | |
2ca993e8 | 3 | * Copyright (C) 2008-2015, International Business Machines |
46f4442e A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 05/11/2008 Andy Heninger Port from Java | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION | |
14 | ||
2ca993e8 A |
15 | #include "unicode/brkiter.h" |
16 | #include "unicode/localpointer.h" | |
46f4442e | 17 | #include "unicode/uchar.h" |
2ca993e8 | 18 | #include "unicode/unifilt.h" |
46f4442e | 19 | #include "unicode/uniset.h" |
2ca993e8 | 20 | |
46f4442e | 21 | #include "brktrans.h" |
46f4442e | 22 | #include "cmemory.h" |
2ca993e8 | 23 | #include "mutex.h" |
46f4442e A |
24 | #include "uprops.h" |
25 | #include "uinvchar.h" | |
26 | #include "util.h" | |
27 | #include "uvectr32.h" | |
28 | ||
29 | U_NAMESPACE_BEGIN | |
30 | ||
31 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) | |
32 | ||
33 | static const UChar SPACE = 32; // ' ' | |
34 | ||
35 | ||
36 | /** | |
37 | * Constructs a transliterator with the default delimiters '{' and | |
38 | * '}'. | |
39 | */ | |
40 | BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : | |
2ca993e8 A |
41 | Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
42 | cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { | |
46f4442e A |
43 | } |
44 | ||
45 | ||
46 | /** | |
47 | * Destructor. | |
48 | */ | |
49 | BreakTransliterator::~BreakTransliterator() { | |
46f4442e A |
50 | } |
51 | ||
52 | /** | |
53 | * Copy constructor. | |
54 | */ | |
55 | BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : | |
2ca993e8 A |
56 | Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { |
57 | } | |
46f4442e A |
58 | |
59 | ||
60 | /** | |
61 | * Transliterator API. | |
62 | */ | |
63 | Transliterator* BreakTransliterator::clone(void) const { | |
64 | return new BreakTransliterator(*this); | |
65 | } | |
66 | ||
67 | /** | |
68 | * Implements {@link Transliterator#handleTransliterate}. | |
69 | */ | |
70 | void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
71 | UBool isIncremental ) const { | |
72 | ||
73 | UErrorCode status = U_ZERO_ERROR; | |
2ca993e8 A |
74 | LocalPointer<BreakIterator> bi; |
75 | LocalPointer<UVector32> boundaries; | |
76 | ||
77 | { | |
78 | Mutex m; | |
79 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); | |
80 | boundaries.moveFrom(nonConstThis->cachedBoundaries); | |
81 | bi.moveFrom(nonConstThis->cachedBI); | |
82 | } | |
83 | if (bi.isNull()) { | |
84 | bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); | |
85 | } | |
86 | if (boundaries.isNull()) { | |
87 | boundaries.adoptInstead(new UVector32(status)); | |
88 | } | |
89 | ||
90 | if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { | |
91 | return; | |
92 | } | |
93 | ||
46f4442e | 94 | boundaries->removeAllElements(); |
46f4442e A |
95 | UnicodeString sText = replaceableAsString(text); |
96 | bi->setText(sText); | |
97 | bi->preceding(offsets.start); | |
98 | ||
99 | // To make things much easier, we will stack the boundaries, and then insert at the end. | |
100 | // generally, we won't need too many, since we will be filtered. | |
101 | ||
102 | int32_t boundary; | |
103 | for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { | |
104 | if (boundary == 0) continue; | |
105 | // HACK: Check to see that preceeding item was a letter | |
106 | ||
107 | UChar32 cp = sText.char32At(boundary-1); | |
108 | int type = u_charType(cp); | |
109 | //System.out.println(Integer.toString(cp,16) + " (before): " + type); | |
110 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
111 | ||
112 | cp = sText.char32At(boundary); | |
113 | type = u_charType(cp); | |
114 | //System.out.println(Integer.toString(cp,16) + " (after): " + type); | |
115 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
116 | ||
117 | boundaries->addElement(boundary, status); | |
729e4ab9 | 118 | // printf("Boundary at %d\n", boundary); |
46f4442e A |
119 | } |
120 | ||
121 | int delta = 0; | |
122 | int lastBoundary = 0; | |
123 | ||
124 | if (boundaries->size() != 0) { // if we found something, adjust | |
125 | delta = boundaries->size() * fInsertion.length(); | |
126 | lastBoundary = boundaries->lastElementi(); | |
127 | ||
128 | // we do this from the end backwards, so that we don't have to keep updating. | |
129 | ||
130 | while (boundaries->size() > 0) { | |
131 | boundary = boundaries->popi(); | |
132 | text.handleReplaceBetween(boundary, boundary, fInsertion); | |
133 | } | |
134 | } | |
135 | ||
136 | // Now fix up the return values | |
137 | offsets.contextLimit += delta; | |
138 | offsets.limit += delta; | |
139 | offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; | |
140 | ||
2ca993e8 A |
141 | // Return break iterator & boundaries vector to the cache. |
142 | { | |
143 | Mutex m; | |
144 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); | |
145 | if (nonConstThis->cachedBI.isNull()) { | |
146 | nonConstThis->cachedBI.moveFrom(bi); | |
147 | } | |
148 | if (nonConstThis->cachedBoundaries.isNull()) { | |
149 | nonConstThis->cachedBoundaries.moveFrom(boundaries); | |
150 | } | |
151 | } | |
152 | ||
46f4442e A |
153 | // TODO: do something with U_FAILURE(status); |
154 | // (need to look at transliterators overall, not just here.) | |
155 | } | |
156 | ||
157 | // | |
158 | // getInsertion() | |
159 | // | |
160 | const UnicodeString &BreakTransliterator::getInsertion() const { | |
161 | return fInsertion; | |
162 | } | |
163 | ||
164 | // | |
165 | // setInsertion() | |
166 | // | |
729e4ab9 | 167 | void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
46f4442e A |
168 | this->fInsertion = insertion; |
169 | } | |
170 | ||
46f4442e A |
171 | // |
172 | // replaceableAsString Hack to let break iterators work | |
173 | // on the replaceable text from transliterators. | |
174 | // In practice, the only real Replaceable type that we | |
175 | // will be seeing is UnicodeString, so this function | |
176 | // will normally be efficient. | |
177 | // | |
178 | UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { | |
46f4442e | 179 | UnicodeString s; |
729e4ab9 A |
180 | UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
181 | if (rs != NULL) { | |
182 | s = *rs; | |
183 | } else { | |
184 | r.extractBetween(0, r.length(), s); | |
185 | } | |
46f4442e A |
186 | return s; |
187 | } | |
188 | ||
189 | U_NAMESPACE_END | |
190 | ||
191 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |