]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/brktrans.cpp
2 **********************************************************************
3 * Copyright (C) 2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 05/11/2008 Andy Heninger Port from Java
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
15 #include "unicode/unifilt.h"
16 #include "unicode/uchar.h"
17 #include "unicode/uniset.h"
18 #include "unicode/brkiter.h"
20 #include "unicode/uchar.h"
// Invokes ICU's RTTI implementation macro for the BreakTransliterator class.
// NOTE(review): presumably this is what provides the class-ID functions
// (getStaticClassID / getDynamicClassID) for this class -- confirm against
// the macro's definition, which is not part of this excerpt.
29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator
)
// File-local constant holding code unit 32 (U+0020, the ASCII space).
// NOTE(review): no use of SPACE is visible in this excerpt; presumably it is
// the default value of fInsertion -- confirm.
31 static const UChar SPACE
= 32; // ' '
35 * Constructs a transliterator with the default delimiters '{' and
// Constructor taking an adopted filter.
// Forwards the ID "Any-BreakInternal" (17 is the length of that literal) and
// adoptedFilter to the Transliterator base class, then allocates the
// `boundaries` vector.
// NOTE(review): original lines 40-41 (the rest of the initializer list and
// the start of the body) are absent from this excerpt.
38 BreakTransliterator::BreakTransliterator(UnicodeFilter
* adoptedFilter
) :
39 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter
),
// Local error code passed to the UVector32 constructor.
// NOTE(review): neither `status` nor the result of `new` is checked in the
// visible lines -- confirm whether allocation failure is handled elsewhere.
42 UErrorCode status
= U_ZERO_ERROR
;
43 boundaries
= new UVector32(status
);
// Destructor.
// NOTE(review): the body is not included in this excerpt; presumably it
// releases `bi` and `boundaries` -- confirm.
50 BreakTransliterator::~BreakTransliterator() {
// Copy constructor.
// In the visible lines it copies fInsertion from `o` and allocates a new
// UVector32 for `boundaries` instead of sharing o's vector.
// NOTE(review): original lines 61-65 (base-class initialization and any
// handling of `bi`) are absent from this excerpt.
60 BreakTransliterator::BreakTransliterator(const BreakTransliterator
& o
) :
66 fInsertion
= o
.fInsertion
;
// NOTE(review): `status` and the result of `new` are not checked in the
// visible lines -- confirm.
67 UErrorCode status
= U_ZERO_ERROR
;
68 boundaries
= new UVector32(status
);
// Returns a newly allocated BreakTransliterator copy-constructed from *this,
// typed as a Transliterator pointer.
// NOTE(review): the caller presumably takes ownership of the returned
// object -- confirm against the Transliterator::clone contract.
75 Transliterator
* BreakTransliterator::clone(void) const {
76 return new BreakTransliterator(*this);
80 * Implements {@link Transliterator#handleTransliterate}.
// Inserts fInsertion into `text` at selected break-iterator boundaries and
// updates `offsets` to account for the inserted text.
//
// Parameters:
//   text          - the Replaceable being edited in place.
//   offsets       - start/limit/contextLimit positions; limit and
//                   contextLimit grow by the total inserted length, and start
//                   is reset as described at the end of the function.
//   isIncremental - selects how offsets.start is set on return.
//
// A boundary is kept only if it is non-zero, lies before offsets.limit, and
// the code points on both sides of it match U_GC_L_MASK | U_GC_M_MASK.
//
// NOTE(review): several original lines (declarations of `boundary` and
// `delta`, closing braces, and line 90) are absent from this excerpt.
// NOTE(review): this const method modifies member state (`boundaries`, and
// `bi` via getBreakIterator), so concurrent calls on one instance are
// presumably unsafe -- confirm.
82 void BreakTransliterator::handleTransliterate(Replaceable
& text
, UTransPosition
& offsets
,
83 UBool isIncremental
) const {
85 UErrorCode status
= U_ZERO_ERROR
;
// Start from an empty stack of boundaries on every call.
86 boundaries
->removeAllElements();
// Cast away const so this const method can call the non-const
// getBreakIterator().
87 BreakTransliterator
*nonConstThis
= (BreakTransliterator
*)this;
88 nonConstThis
->getBreakIterator(); // Lazy-create it if necessary
// Obtain the text as a UnicodeString so code points can be inspected.
89 UnicodeString sText
= replaceableAsString(text
);
// Position the iterator relative to offsets.start before scanning forward.
// NOTE(review): original line 90 is missing; presumably it attaches sText to
// the break iterator -- confirm.
91 bi
->preceding(offsets
.start
);
93 // To make things much easier, we will stack the boundaries, and then insert at the end.
94 // generally, we won't need too many, since we will be filtered.
// Walk boundaries until the iterator is exhausted or offsets.limit is reached.
97 for(boundary
= bi
->next(); boundary
!= UBRK_DONE
&& boundary
< offsets
.limit
; boundary
= bi
->next()) {
// Skip a boundary at index 0: there is no preceding code point to test.
98 if (boundary
== 0) continue;
99 // HACK: Check to see that preceding item was a letter
// The mask test below accepts general categories L and M (U_GC_L_MASK |
// U_GC_M_MASK), not letters alone.
101 UChar32 cp
= sText
.char32At(boundary
-1);
102 int type
= u_charType(cp
);
103 //System.out.println(Integer.toString(cp,16) + " (before): " + type);
104 if ((U_MASK(type
) & (U_GC_L_MASK
| U_GC_M_MASK
)) == 0) continue;
// Apply the same category test to the code point that follows the boundary.
106 cp
= sText
.char32At(boundary
);
107 type
= u_charType(cp
);
108 //System.out.println(Integer.toString(cp,16) + " (after): " + type);
109 if ((U_MASK(type
) & (U_GC_L_MASK
| U_GC_M_MASK
)) == 0) continue;
// Both neighbours qualified: remember this boundary for insertion later.
111 boundaries
->addElement(boundary
, status
);
112 //System.out.println(boundary);
116 int lastBoundary
= 0;
118 if (boundaries
->size() != 0) { // if we found something, adjust
// delta = total number of code units that will be inserted.
119 delta
= boundaries
->size() * fInsertion
.length();
// Record the last boundary pushed (read before the vector is emptied below).
120 lastBoundary
= boundaries
->lastElementi();
122 // we do this from the end backwards, so that we don't have to keep updating.
124 while (boundaries
->size() > 0) {
125 boundary
= boundaries
->popi();
// Replacing the empty range [boundary, boundary) is a pure insertion.
126 text
.handleReplaceBetween(boundary
, boundary
, fInsertion
);
130 // Now fix up the return values
131 offsets
.contextLimit
+= delta
;
132 offsets
.limit
+= delta
;
// Incremental mode: start = last stacked boundary + delta.
// Otherwise: start = the updated limit.
133 offsets
.start
= isIncremental
? lastBoundary
+ delta
: offsets
.limit
;
135 // TODO: do something with U_FAILURE(status);
136 // (need to look at transliterators overall, not just here.)
// Const accessor that returns a const reference to a UnicodeString.
// NOTE(review): the body is not included in this excerpt; presumably it
// returns fInsertion -- confirm.
142 const UnicodeString
&BreakTransliterator::getInsertion() const {
// Assigns `insertion` to the fInsertion member, which handleTransliterate
// inserts at each selected boundary.
// NOTE(review): the method name is spelled "setInsertsion"; it is part of the
// class's declared interface, so the spelling is left untouched here.
149 void BreakTransliterator::setInsertsion(const UnicodeString
&insertion
) {
150 this->fInsertion
= insertion
;
154 // getBreakIterator Lazily create the break iterator if it does
155 // not already exist. Copied from Java, probably
156 // better to just create it in the constructor.
// Sets the `bi` member to a word BreakIterator created for the English locale.
// NOTE(review): original line 160 and the closing lines are absent from this
// excerpt; presumably a guard makes the creation happen only when `bi` is not
// yet set, and `bi` is returned -- confirm.
// NOTE(review): `status` is not inspected in the visible lines, so a failed
// creation would go unnoticed here -- confirm.
158 BreakIterator
*BreakTransliterator::getBreakIterator() {
159 UErrorCode status
= U_ZERO_ERROR
;
161 // Note: Thai breaking behavior is universal, it is not
162 // tied to the Thai locale.
163 bi
= BreakIterator::createWordInstance(Locale::getEnglish(), status
);
169 // replaceableAsString Hack to let break iterators work
170 // on the replaceable text from transliterators.
171 // In practice, the only real Replaceable type that we
172 // will be seeing is UnicodeString, so this function
173 // will normally be efficient.
// Returns the contents of Replaceable `r` as a UnicodeString, by value.
// NOTE(review): the declaration of `s` and the final return statement are on
// lines absent from this excerpt.
175 UnicodeString
BreakTransliterator::replaceableAsString(Replaceable
&r
) {
// If r's dynamic class ID equals UnicodeString's static class ID, r is cast
// to UnicodeString& and returned directly.
176 if (r
.getDynamicClassID() == UnicodeString::getStaticClassID()) {
177 return (UnicodeString
&) r
;
// Otherwise extract the whole range [0, r.length()) into `s`.
180 r
.extractBetween(0, r
.length(), s
);
186 #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */