]>
Commit | Line | Data |
---|---|---|
46f4442e A |
1 | /* |
2 | ********************************************************************** | |
729e4ab9 | 3 | * Copyright (C) 2008-2010, International Business Machines |
46f4442e A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 05/11/2008 Andy Heninger Port from Java | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION | |
14 | ||
15 | #include "unicode/unifilt.h" | |
16 | #include "unicode/uchar.h" | |
17 | #include "unicode/uniset.h" | |
18 | #include "unicode/brkiter.h" | |
19 | #include "brktrans.h" | |
20 | #include "unicode/uchar.h" | |
21 | #include "cmemory.h" | |
22 | #include "uprops.h" | |
23 | #include "uinvchar.h" | |
24 | #include "util.h" | |
25 | #include "uvectr32.h" | |
26 | ||
27 | U_NAMESPACE_BEGIN | |
28 | ||
29 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) | |
30 | ||
31 | static const UChar SPACE = 32; // ' ' | |
32 | ||
33 | ||
34 | /** | |
35 | * Constructs a transliterator with the default delimiters '{' and | |
36 | * '}'. | |
37 | */ | |
38 | BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : | |
39 | Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), | |
40 | fInsertion(SPACE) { | |
41 | bi = NULL; | |
42 | UErrorCode status = U_ZERO_ERROR; | |
43 | boundaries = new UVector32(status); | |
44 | } | |
45 | ||
46 | ||
47 | /** | |
48 | * Destructor. | |
49 | */ | |
50 | BreakTransliterator::~BreakTransliterator() { | |
51 | delete bi; | |
52 | bi = NULL; | |
53 | delete boundaries; | |
54 | boundaries = NULL; | |
55 | } | |
56 | ||
57 | /** | |
58 | * Copy constructor. | |
59 | */ | |
60 | BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : | |
61 | Transliterator(o) { | |
62 | bi = NULL; | |
63 | if (o.bi != NULL) { | |
64 | bi = o.bi->clone(); | |
65 | } | |
66 | fInsertion = o.fInsertion; | |
67 | UErrorCode status = U_ZERO_ERROR; | |
68 | boundaries = new UVector32(status); | |
69 | } | |
70 | ||
71 | ||
72 | /** | |
73 | * Transliterator API. | |
74 | */ | |
75 | Transliterator* BreakTransliterator::clone(void) const { | |
76 | return new BreakTransliterator(*this); | |
77 | } | |
78 | ||
79 | /** | |
80 | * Implements {@link Transliterator#handleTransliterate}. | |
81 | */ | |
82 | void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
83 | UBool isIncremental ) const { | |
84 | ||
85 | UErrorCode status = U_ZERO_ERROR; | |
86 | boundaries->removeAllElements(); | |
87 | BreakTransliterator *nonConstThis = (BreakTransliterator *)this; | |
88 | nonConstThis->getBreakIterator(); // Lazy-create it if necessary | |
89 | UnicodeString sText = replaceableAsString(text); | |
90 | bi->setText(sText); | |
91 | bi->preceding(offsets.start); | |
92 | ||
93 | // To make things much easier, we will stack the boundaries, and then insert at the end. | |
94 | // generally, we won't need too many, since we will be filtered. | |
95 | ||
96 | int32_t boundary; | |
97 | for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { | |
98 | if (boundary == 0) continue; | |
99 | // HACK: Check to see that preceeding item was a letter | |
100 | ||
101 | UChar32 cp = sText.char32At(boundary-1); | |
102 | int type = u_charType(cp); | |
103 | //System.out.println(Integer.toString(cp,16) + " (before): " + type); | |
104 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
105 | ||
106 | cp = sText.char32At(boundary); | |
107 | type = u_charType(cp); | |
108 | //System.out.println(Integer.toString(cp,16) + " (after): " + type); | |
109 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; | |
110 | ||
111 | boundaries->addElement(boundary, status); | |
729e4ab9 | 112 | // printf("Boundary at %d\n", boundary); |
46f4442e A |
113 | } |
114 | ||
115 | int delta = 0; | |
116 | int lastBoundary = 0; | |
117 | ||
118 | if (boundaries->size() != 0) { // if we found something, adjust | |
119 | delta = boundaries->size() * fInsertion.length(); | |
120 | lastBoundary = boundaries->lastElementi(); | |
121 | ||
122 | // we do this from the end backwards, so that we don't have to keep updating. | |
123 | ||
124 | while (boundaries->size() > 0) { | |
125 | boundary = boundaries->popi(); | |
126 | text.handleReplaceBetween(boundary, boundary, fInsertion); | |
127 | } | |
128 | } | |
129 | ||
130 | // Now fix up the return values | |
131 | offsets.contextLimit += delta; | |
132 | offsets.limit += delta; | |
133 | offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; | |
134 | ||
135 | // TODO: do something with U_FAILURE(status); | |
136 | // (need to look at transliterators overall, not just here.) | |
137 | } | |
138 | ||
139 | // | |
140 | // getInsertion() | |
141 | // | |
142 | const UnicodeString &BreakTransliterator::getInsertion() const { | |
143 | return fInsertion; | |
144 | } | |
145 | ||
146 | // | |
147 | // setInsertion() | |
148 | // | |
729e4ab9 | 149 | void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
46f4442e A |
150 | this->fInsertion = insertion; |
151 | } | |
152 | ||
153 | // | |
154 | // getBreakIterator Lazily create the break iterator if it does | |
155 | // not already exist. Copied from Java, probably | |
156 | // better to just create it in the constructor. | |
157 | // | |
158 | BreakIterator *BreakTransliterator::getBreakIterator() { | |
159 | UErrorCode status = U_ZERO_ERROR; | |
160 | if (bi == NULL) { | |
161 | // Note: Thai breaking behavior is universal, it is not | |
162 | // tied to the Thai locale. | |
163 | bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); | |
164 | } | |
165 | return bi; | |
166 | } | |
167 | ||
168 | // | |
169 | // replaceableAsString Hack to let break iterators work | |
170 | // on the replaceable text from transliterators. | |
171 | // In practice, the only real Replaceable type that we | |
172 | // will be seeing is UnicodeString, so this function | |
173 | // will normally be efficient. | |
174 | // | |
175 | UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { | |
46f4442e | 176 | UnicodeString s; |
729e4ab9 A |
177 | UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
178 | if (rs != NULL) { | |
179 | s = *rs; | |
180 | } else { | |
181 | r.extractBetween(0, r.length(), s); | |
182 | } | |
46f4442e A |
183 | return s; |
184 | } | |
185 | ||
186 | U_NAMESPACE_END | |
187 | ||
188 | #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */ |