icuSources/i18n/strmatch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2012, International Business Machines Corporation
   4 *   and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   07/23/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "strmatch.h"
  16 #include "rbt_data.h"
  17 #include "util.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/utf16.h"
  20
  21 U_NAMESPACE_BEGIN
  22
  23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
  24
  25 StringMatcher::StringMatcher(const UnicodeString& theString,
  26                              int32_t start,
  27                              int32_t limit,
  28                              int32_t segmentNum,
  29                              const TransliterationRuleData& theData) :
  30     data(&theData),
  31     segmentNumber(segmentNum),
  32     matchStart(-1),
  33     matchLimit(-1)
  34 {
  35     theString.extractBetween(start, limit, pattern);
  36 }
  37
  38 StringMatcher::StringMatcher(const StringMatcher& o) :
  39     UnicodeFunctor(o),
  40     UnicodeMatcher(o),
  41     UnicodeReplacer(o),
  42     pattern(o.pattern),
  43     data(o.data),
  44     segmentNumber(o.segmentNumber),
  45     matchStart(o.matchStart),
  46     matchLimit(o.matchLimit)
  47 {
  48 }
  49
  50 /**
  51  * Destructor
  52  */
  53 StringMatcher::~StringMatcher() {
  54 }
  55
  56 /**
  57  * Implement UnicodeFunctor
  58  */
  59 UnicodeFunctor* StringMatcher::clone() const {
  60     return new StringMatcher(*this);
  61 }
  62
  63 /**
  64  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
  65  * and return the pointer.
  66  */
  67 UnicodeMatcher* StringMatcher::toMatcher() const {
  68   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
  69   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
  70
  71   return nonconst_base;
  72 }
  73
  74 /**
  75  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
  76  * and return the pointer.
  77  */
  78 UnicodeReplacer* StringMatcher::toReplacer() const {
  79   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
  80   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
  81
  82   return nonconst_base;
  83 }
  84
  85 /**
  86  * Implement UnicodeMatcher
  87  */
  88 UMatchDegree StringMatcher::matches(const Replaceable& text,
  89                                     int32_t& offset,
  90                                     int32_t limit,
  91                                     UBool incremental) {
  92     int32_t i;
  93     int32_t cursor = offset;
  94     if (limit < cursor) {
  95         // Match in the reverse direction
  96         for (i=pattern.length()-1; i>=0; --i) {
  97             UChar keyChar = pattern.charAt(i);
  98             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
  99             if (subm == 0) {
 100                 if (cursor > limit &&
 101                     keyChar == text.charAt(cursor)) {
 102                     --cursor;
 103                 } else {
 104                     return U_MISMATCH;
 105                 }
 106             } else {
 107                 UMatchDegree m =
 108                     subm->matches(text, cursor, limit, incremental);
 109                 if (m != U_MATCH) {
 110                     return m;
 111                 }
 112             }
 113         }
 114         // Record the match position, but adjust for a normal
 115         // forward start, limit, and only if a prior match does not
 116         // exist -- we want the rightmost match.
 117         if (matchStart < 0) {
 118             matchStart = cursor+1;
 119             matchLimit = offset+1;
 120         }
 121     } else {
 122         for (i=0; i<pattern.length(); ++i) {
 123             if (incremental && cursor == limit) {
 124                 // We've reached the context limit without a mismatch and
 125                 // without completing our match.
 126                 return U_PARTIAL_MATCH;
 127             }
 128             UChar keyChar = pattern.charAt(i);
 129             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 130             if (subm == 0) {
 131                 // Don't need the cursor < limit check if
 132                 // incremental is TRUE (because it's done above); do need
 133                 // it otherwise.
 134                 if (cursor < limit &&
 135                     keyChar == text.charAt(cursor)) {
 136                     ++cursor;
 137                 } else {
 138                     return U_MISMATCH;
 139                 }
 140             } else {
 141                 UMatchDegree m =
 142                     subm->matches(text, cursor, limit, incremental);
 143                 if (m != U_MATCH) {
 144                     return m;
 145                 }
 146             }
 147         }
 148         // Record the match position
 149         matchStart = offset;
 150         matchLimit = cursor;
 151     }
 152
 153     offset = cursor;
 154     return U_MATCH;
 155 }
 156
 157 /**
 158  * Implement UnicodeMatcher
 159  */
 160 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
 161                                         UBool escapeUnprintable) const
 162 {
 163     result.truncate(0);
 164     UnicodeString str, quoteBuf;
 165     if (segmentNumber > 0) {
 166         result.append((UChar)40); /*(*/
 167     }
 168     for (int32_t i=0; i<pattern.length(); ++i) {
 169         UChar keyChar = pattern.charAt(i);
 170         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
 171         if (m == 0) {
 172             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
 173         } else {
 174             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
 175                          TRUE, escapeUnprintable, quoteBuf);
 176         }
 177     }
 178     if (segmentNumber > 0) {
 179         result.append((UChar)41); /*)*/
 180     }
 181     // Flush quoteBuf out to result
 182     ICU_Utility::appendToRule(result, -1,
 183                               TRUE, escapeUnprintable, quoteBuf);
 184     return result;
 185 }
 186
 187 /**
 188  * Implement UnicodeMatcher
 189  */
 190 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
 191     if (pattern.length() == 0) {
 192         return TRUE;
 193     }
 194     UChar32 c = pattern.char32At(0);
 195     const UnicodeMatcher *m = data->lookupMatcher(c);
 196     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 197 }
 198
 199 /**
 200  * Implement UnicodeMatcher
 201  */
 202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
 203     UChar32 ch;
 204     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
 205         ch = pattern.char32At(i);
 206         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
 207         if (matcher == NULL) {
 208             toUnionTo.add(ch);
 209         } else {
 210             matcher->addMatchSetTo(toUnionTo);
 211         }
 212     }
 213 }
 214
 215 /**
 216  * UnicodeReplacer API
 217  */
 218 int32_t StringMatcher::replace(Replaceable& text,
 219                                int32_t start,
 220                                int32_t limit,
 221                                int32_t& /*cursor*/) {
 222
 223     int32_t outLen = 0;
 224
 225     // Copy segment with out-of-band data
 226     int32_t dest = limit;
 227     // If there was no match, that means that a quantifier
 228     // matched zero-length.  E.g., x (a)* y matched "xy".
 229     if (matchStart >= 0) {
 230         if (matchStart != matchLimit) {
 231             text.copy(matchStart, matchLimit, dest);
 232             outLen = matchLimit - matchStart;
 233         }
 234     }
 235
 236     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
 237
 238     return outLen;
 239 }
 240
 241 /**
 242  * UnicodeReplacer API
 243  */
 244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
 245                                                 UBool /*escapeUnprintable*/) const {
 246     // assert(segmentNumber > 0);
 247     rule.truncate(0);
 248     rule.append((UChar)0x0024 /*$*/);
 249     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
 250     return rule;
 251 }
 252
 253 /**
 254  * Remove any match info.  This must be called before performing a
 255  * set of matches with this segment.
 256  */
 257  void StringMatcher::resetMatch() {
 258     matchStart = matchLimit = -1;
 259 }
 260
 261 /**
 262  * Union the set of all characters that may output by this object
 263  * into the given set.
 264  * @param toUnionTo the set into which to union the output characters
 265  */
 266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
 267     // The output of this replacer varies; it is the source text between
 268     // matchStart and matchLimit.  Since this varies depending on the
 269     // input text, we can't compute it here.  We can either do nothing
 270     // or we can add ALL characters to the set.  It's probably more useful
 271     // to do nothing.
 272 }
 273
 274 /**
 275  * Implement UnicodeFunctor
 276  */
 277 void StringMatcher::setData(const TransliterationRuleData* d) {
 278     data = d;
 279     int32_t i = 0;
 280     while (i<pattern.length()) {
 281         UChar32 c = pattern.char32At(i);
 282         UnicodeFunctor* f = data->lookup(c);
 283         if (f != NULL) {
 284             f->setData(data);
 285         }
 286         i += U16_LENGTH(c);
 287     }
 288 }
 289
 290 U_NAMESPACE_END
 291
 292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 293
 294 //eof