icuSources/i18n/strmatch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2011, International Business Machines Corporation
   4 *   and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   07/23/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "strmatch.h"
  16 #include "rbt_data.h"
  17 #include "util.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/utf16.h"
  20
  21 U_NAMESPACE_BEGIN
  22
  23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
  24
  25 StringMatcher::StringMatcher(const UnicodeString& theString,
  26                              int32_t start,
  27                              int32_t limit,
  28                              int32_t segmentNum,
  29                              const TransliterationRuleData& theData) :
  30     data(&theData),
  31     segmentNumber(segmentNum),
  32     matchStart(-1),
  33     matchLimit(-1)
  34 {
  35     theString.extractBetween(start, limit, pattern);
  36 }
  37
  38 StringMatcher::StringMatcher(const StringMatcher& o) :
  39     UnicodeFunctor(o),
  40     UnicodeMatcher(o),
  41     UnicodeReplacer(o),
  42     pattern(o.pattern),
  43     data(o.data),
  44     segmentNumber(o.segmentNumber),
  45     matchStart(o.matchStart),
  46     matchLimit(o.matchLimit)
  47 {
  48 }
  49
  50 /**
  51  * Destructor
  52  */
  53 StringMatcher::~StringMatcher() {
  54 }
  55
  56 /**
  57  * Implement UnicodeFunctor
  58  */
  59 UnicodeFunctor* StringMatcher::clone() const {
  60     return new StringMatcher(*this);
  61 }
  62
  63 /**
  64  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
  65  * and return the pointer.
  66  */
  67 UnicodeMatcher* StringMatcher::toMatcher() const {
  68     return (UnicodeMatcher*) this;
  69 }
  70
  71 /**
  72  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
  73  * and return the pointer.
  74  */
  75 UnicodeReplacer* StringMatcher::toReplacer() const {
  76     return (UnicodeReplacer*) this;
  77 }
  78
  79 /**
  80  * Implement UnicodeMatcher
  81  */
  82 UMatchDegree StringMatcher::matches(const Replaceable& text,
  83                                     int32_t& offset,
  84                                     int32_t limit,
  85                                     UBool incremental) {
  86     int32_t i;
  87     int32_t cursor = offset;
  88     if (limit < cursor) {
  89         // Match in the reverse direction
  90         for (i=pattern.length()-1; i>=0; --i) {
  91             UChar keyChar = pattern.charAt(i);
  92             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
  93             if (subm == 0) {
  94                 if (cursor > limit &&
  95                     keyChar == text.charAt(cursor)) {
  96                     --cursor;
  97                 } else {
  98                     return U_MISMATCH;
  99                 }
 100             } else {
 101                 UMatchDegree m =
 102                     subm->matches(text, cursor, limit, incremental);
 103                 if (m != U_MATCH) {
 104                     return m;
 105                 }
 106             }
 107         }
 108         // Record the match position, but adjust for a normal
 109         // forward start, limit, and only if a prior match does not
 110         // exist -- we want the rightmost match.
 111         if (matchStart < 0) {
 112             matchStart = cursor+1;
 113             matchLimit = offset+1;
 114         }
 115     } else {
 116         for (i=0; i<pattern.length(); ++i) {
 117             if (incremental && cursor == limit) {
 118                 // We've reached the context limit without a mismatch and
 119                 // without completing our match.
 120                 return U_PARTIAL_MATCH;
 121             }
 122             UChar keyChar = pattern.charAt(i);
 123             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 124             if (subm == 0) {
 125                 // Don't need the cursor < limit check if
 126                 // incremental is TRUE (because it's done above); do need
 127                 // it otherwise.
 128                 if (cursor < limit &&
 129                     keyChar == text.charAt(cursor)) {
 130                     ++cursor;
 131                 } else {
 132                     return U_MISMATCH;
 133                 }
 134             } else {
 135                 UMatchDegree m =
 136                     subm->matches(text, cursor, limit, incremental);
 137                 if (m != U_MATCH) {
 138                     return m;
 139                 }
 140             }
 141         }
 142         // Record the match position
 143         matchStart = offset;
 144         matchLimit = cursor;
 145     }
 146
 147     offset = cursor;
 148     return U_MATCH;
 149 }
 150
 151 /**
 152  * Implement UnicodeMatcher
 153  */
 154 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
 155                                         UBool escapeUnprintable) const
 156 {
 157     result.truncate(0);
 158     UnicodeString str, quoteBuf;
 159     if (segmentNumber > 0) {
 160         result.append((UChar)40); /*(*/
 161     }
 162     for (int32_t i=0; i<pattern.length(); ++i) {
 163         UChar keyChar = pattern.charAt(i);
 164         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
 165         if (m == 0) {
 166             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
 167         } else {
 168             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
 169                          TRUE, escapeUnprintable, quoteBuf);
 170         }
 171     }
 172     if (segmentNumber > 0) {
 173         result.append((UChar)41); /*)*/
 174     }
 175     // Flush quoteBuf out to result
 176     ICU_Utility::appendToRule(result, -1,
 177                               TRUE, escapeUnprintable, quoteBuf);
 178     return result;
 179 }
 180
 181 /**
 182  * Implement UnicodeMatcher
 183  */
 184 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
 185     if (pattern.length() == 0) {
 186         return TRUE;
 187     }
 188     UChar32 c = pattern.char32At(0);
 189     const UnicodeMatcher *m = data->lookupMatcher(c);
 190     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 191 }
 192
 193 /**
 194  * Implement UnicodeMatcher
 195  */
 196 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
 197     UChar32 ch;
 198     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
 199         ch = pattern.char32At(i);
 200         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
 201         if (matcher == NULL) {
 202             toUnionTo.add(ch);
 203         } else {
 204             matcher->addMatchSetTo(toUnionTo);
 205         }
 206     }
 207 }
 208
 209 /**
 210  * UnicodeReplacer API
 211  */
 212 int32_t StringMatcher::replace(Replaceable& text,
 213                                int32_t start,
 214                                int32_t limit,
 215                                int32_t& /*cursor*/) {
 216
 217     int32_t outLen = 0;
 218
 219     // Copy segment with out-of-band data
 220     int32_t dest = limit;
 221     // If there was no match, that means that a quantifier
 222     // matched zero-length.  E.g., x (a)* y matched "xy".
 223     if (matchStart >= 0) {
 224         if (matchStart != matchLimit) {
 225             text.copy(matchStart, matchLimit, dest);
 226             outLen = matchLimit - matchStart;
 227         }
 228     }
 229
 230     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
 231
 232     return outLen;
 233 }
 234
 235 /**
 236  * UnicodeReplacer API
 237  */
 238 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
 239                                                 UBool /*escapeUnprintable*/) const {
 240     // assert(segmentNumber > 0);
 241     rule.truncate(0);
 242     rule.append((UChar)0x0024 /*$*/);
 243     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
 244     return rule;
 245 }
 246
 247 /**
 248  * Remove any match info.  This must be called before performing a
 249  * set of matches with this segment.
 250  */
 251  void StringMatcher::resetMatch() {
 252     matchStart = matchLimit = -1;
 253 }
 254
 255 /**
 256  * Union the set of all characters that may output by this object
 257  * into the given set.
 258  * @param toUnionTo the set into which to union the output characters
 259  */
 260 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
 261     // The output of this replacer varies; it is the source text between
 262     // matchStart and matchLimit.  Since this varies depending on the
 263     // input text, we can't compute it here.  We can either do nothing
 264     // or we can add ALL characters to the set.  It's probably more useful
 265     // to do nothing.
 266 }
 267
 268 /**
 269  * Implement UnicodeFunctor
 270  */
 271 void StringMatcher::setData(const TransliterationRuleData* d) {
 272     data = d;
 273     int32_t i = 0;
 274     while (i<pattern.length()) {
 275         UChar32 c = pattern.char32At(i);
 276         UnicodeFunctor* f = data->lookup(c);
 277         if (f != NULL) {
 278             f->setData(data);
 279         }
 280         i += U16_LENGTH(c);
 281     }
 282 }
 283
 284 U_NAMESPACE_END
 285
 286 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 287
 288 //eof