icuSources/i18n/strmatch.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2004, International Business Machines Corporation
   4 *   and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   07/23/01    aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "strmatch.h"
  16 #include "rbt_data.h"
  17 #include "util.h"
  18 #include "unicode/uniset.h"
  19
  20 U_NAMESPACE_BEGIN
  21
  22 static const UChar EMPTY[] = { 0 }; // empty string: ""
  23
  24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
  25
  26 StringMatcher::StringMatcher(const UnicodeString& theString,
  27                              int32_t start,
  28                              int32_t limit,
  29                              int32_t segmentNum,
  30                              const TransliterationRuleData& theData) :
  31     data(&theData),
  32     segmentNumber(segmentNum),
  33     matchStart(-1),
  34     matchLimit(-1)
  35 {
  36     theString.extractBetween(start, limit, pattern);
  37 }
  38
  39 StringMatcher::StringMatcher(const StringMatcher& o) :
  40     UnicodeFunctor(o),
  41     UnicodeMatcher(o),
  42     UnicodeReplacer(o),
  43     pattern(o.pattern),
  44     data(o.data),
  45     segmentNumber(o.segmentNumber),
  46     matchStart(o.matchStart),
  47     matchLimit(o.matchLimit)
  48 {
  49 }
  50
  51 /**
  52  * Destructor
  53  */
  54 StringMatcher::~StringMatcher() {
  55 }
  56
  57 /**
  58  * Implement UnicodeFunctor
  59  */
  60 UnicodeFunctor* StringMatcher::clone() const {
  61     return new StringMatcher(*this);
  62 }
  63
  64 /**
  65  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
  66  * and return the pointer.
  67  */
  68 UnicodeMatcher* StringMatcher::toMatcher() const {
  69     return (UnicodeMatcher*) this;
  70 }
  71
  72 /**
  73  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
  74  * and return the pointer.
  75  */
  76 UnicodeReplacer* StringMatcher::toReplacer() const {
  77     return (UnicodeReplacer*) this;
  78 }
  79
  80 /**
  81  * Implement UnicodeMatcher
  82  */
  83 UMatchDegree StringMatcher::matches(const Replaceable& text,
  84                                     int32_t& offset,
  85                                     int32_t limit,
  86                                     UBool incremental) {
  87     int32_t i;
  88     int32_t cursor = offset;
  89     if (limit < cursor) {
  90         // Match in the reverse direction
  91         for (i=pattern.length()-1; i>=0; --i) {
  92             UChar keyChar = pattern.charAt(i);
  93             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
  94             if (subm == 0) {
  95                 if (cursor > limit &&
  96                     keyChar == text.charAt(cursor)) {
  97                     --cursor;
  98                 } else {
  99                     return U_MISMATCH;
 100                 }
 101             } else {
 102                 UMatchDegree m =
 103                     subm->matches(text, cursor, limit, incremental);
 104                 if (m != U_MATCH) {
 105                     return m;
 106                 }
 107             }
 108         }
 109         // Record the match position, but adjust for a normal
 110         // forward start, limit, and only if a prior match does not
 111         // exist -- we want the rightmost match.
 112         if (matchStart < 0) {
 113             matchStart = cursor+1;
 114             matchLimit = offset+1;
 115         }
 116     } else {
 117         for (i=0; i<pattern.length(); ++i) {
 118             if (incremental && cursor == limit) {
 119                 // We've reached the context limit without a mismatch and
 120                 // without completing our match.
 121                 return U_PARTIAL_MATCH;
 122             }
 123             UChar keyChar = pattern.charAt(i);
 124             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 125             if (subm == 0) {
 126                 // Don't need the cursor < limit check if
 127                 // incremental is TRUE (because it's done above); do need
 128                 // it otherwise.
 129                 if (cursor < limit &&
 130                     keyChar == text.charAt(cursor)) {
 131                     ++cursor;
 132                 } else {
 133                     return U_MISMATCH;
 134                 }
 135             } else {
 136                 UMatchDegree m =
 137                     subm->matches(text, cursor, limit, incremental);
 138                 if (m != U_MATCH) {
 139                     return m;
 140                 }
 141             }
 142         }
 143         // Record the match position
 144         matchStart = offset;
 145         matchLimit = cursor;
 146     }
 147
 148     offset = cursor;
 149     return U_MATCH;
 150 }
 151
 152 /**
 153  * Implement UnicodeMatcher
 154  */
 155 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
 156                                         UBool escapeUnprintable) const
 157 {
 158     result.truncate(0);
 159     UnicodeString str, quoteBuf;
 160     if (segmentNumber > 0) {
 161         result.append((UChar)40); /*(*/
 162     }
 163     for (int32_t i=0; i<pattern.length(); ++i) {
 164         UChar keyChar = pattern.charAt(i);
 165         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
 166         if (m == 0) {
 167             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
 168         } else {
 169             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
 170                          TRUE, escapeUnprintable, quoteBuf);
 171         }
 172     }
 173     if (segmentNumber > 0) {
 174         result.append((UChar)41); /*)*/
 175     }
 176     // Flush quoteBuf out to result
 177     ICU_Utility::appendToRule(result, -1,
 178                               TRUE, escapeUnprintable, quoteBuf);
 179     return result;
 180 }
 181
 182 /**
 183  * Implement UnicodeMatcher
 184  */
 185 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
 186     if (pattern.length() == 0) {
 187         return TRUE;
 188     }
 189     UChar32 c = pattern.char32At(0);
 190     const UnicodeMatcher *m = data->lookupMatcher(c);
 191     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 192 }
 193
 194 /**
 195  * Implement UnicodeMatcher
 196  */
 197 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
 198     UChar32 ch;
 199     for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
 200         ch = pattern.char32At(i);
 201         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
 202         if (matcher == NULL) {
 203             toUnionTo.add(ch);
 204         } else {
 205             matcher->addMatchSetTo(toUnionTo);
 206         }
 207     }
 208 }
 209
 210 /**
 211  * UnicodeReplacer API
 212  */
 213 int32_t StringMatcher::replace(Replaceable& text,
 214                                int32_t start,
 215                                int32_t limit,
 216                                int32_t& /*cursor*/) {
 217
 218     int32_t outLen = 0;
 219
 220     // Copy segment with out-of-band data
 221     int32_t dest = limit;
 222     // If there was no match, that means that a quantifier
 223     // matched zero-length.  E.g., x (a)* y matched "xy".
 224     if (matchStart >= 0) {
 225         if (matchStart != matchLimit) {
 226             text.copy(matchStart, matchLimit, dest);
 227             outLen = matchLimit - matchStart;
 228         }
 229     }
 230
 231     text.handleReplaceBetween(start, limit, EMPTY); // delete original text
 232
 233     return outLen;
 234 }
 235
 236 /**
 237  * UnicodeReplacer API
 238  */
 239 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
 240                                                 UBool /*escapeUnprintable*/) const {
 241     // assert(segmentNumber > 0);
 242     rule.truncate(0);
 243     rule.append((UChar)0x0024 /*$*/);
 244     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
 245     return rule;
 246 }
 247
 248 /**
 249  * Remove any match info.  This must be called before performing a
 250  * set of matches with this segment.
 251  */
 252  void StringMatcher::resetMatch() {
 253     matchStart = matchLimit = -1;
 254 }
 255
 256 /**
 257  * Union the set of all characters that may output by this object
 258  * into the given set.
 259  * @param toUnionTo the set into which to union the output characters
 260  */
 261 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
 262     // The output of this replacer varies; it is the source text between
 263     // matchStart and matchLimit.  Since this varies depending on the
 264     // input text, we can't compute it here.  We can either do nothing
 265     // or we can add ALL characters to the set.  It's probably more useful
 266     // to do nothing.
 267 }
 268
 269 /**
 270  * Implement UnicodeFunctor
 271  */
 272 void StringMatcher::setData(const TransliterationRuleData* d) {
 273     data = d;
 274     int32_t i = 0;
 275     while (i<pattern.length()) {
 276         UChar32 c = pattern.char32At(i);
 277         UnicodeFunctor* f = data->lookup(c);
 278         if (f != NULL) {
 279             f->setData(data);
 280         }
 281         i += UTF_CHAR_LENGTH(c);
 282     }
 283 }
 284
 285 U_NAMESPACE_END
 286
 287 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 288
 289 //eof