icuSources/i18n/strmatch.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (c) 2001-2012, International Business Machines Corporation
   6 *   and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   07/23/01    aliu        Creation.
  10 **********************************************************************
  11 */
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_TRANSLITERATION
  16
  17 #include "strmatch.h"
  18 #include "rbt_data.h"
  19 #include "util.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/utf16.h"
  22
  23 U_NAMESPACE_BEGIN
  24
  25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
  26
  27 StringMatcher::StringMatcher(const UnicodeString& theString,
  28                              int32_t start,
  29                              int32_t limit,
  30                              int32_t segmentNum,
  31                              const TransliterationRuleData& theData) :
  32     data(&theData),
  33     segmentNumber(segmentNum),
  34     matchStart(-1),
  35     matchLimit(-1)
  36 {
  37     theString.extractBetween(start, limit, pattern);
  38 }
  39
  40 StringMatcher::StringMatcher(const StringMatcher& o) :
  41     UnicodeFunctor(o),
  42     UnicodeMatcher(o),
  43     UnicodeReplacer(o),
  44     pattern(o.pattern),
  45     data(o.data),
  46     segmentNumber(o.segmentNumber),
  47     matchStart(o.matchStart),
  48     matchLimit(o.matchLimit)
  49 {
  50 }
  51
  52 /**
  53  * Destructor
  54  */
  55 StringMatcher::~StringMatcher() {
  56 }
  57
  58 /**
  59  * Implement UnicodeFunctor
  60  */
  61 UnicodeFunctor* StringMatcher::clone() const {
  62     return new StringMatcher(*this);
  63 }
  64
  65 /**
  66  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
  67  * and return the pointer.
  68  */
  69 UnicodeMatcher* StringMatcher::toMatcher() const {
  70   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
  71   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
  72
  73   return nonconst_base;
  74 }
  75
  76 /**
  77  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
  78  * and return the pointer.
  79  */
  80 UnicodeReplacer* StringMatcher::toReplacer() const {
  81   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
  82   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
  83
  84   return nonconst_base;
  85 }
  86
  87 /**
  88  * Implement UnicodeMatcher
  89  */
  90 UMatchDegree StringMatcher::matches(const Replaceable& text,
  91                                     int32_t& offset,
  92                                     int32_t limit,
  93                                     UBool incremental) {
  94     int32_t i;
  95     int32_t cursor = offset;
  96     if (limit < cursor) {
  97         // Match in the reverse direction
  98         for (i=pattern.length()-1; i>=0; --i) {
  99             UChar keyChar = pattern.charAt(i);
 100             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 101             if (subm == 0) {
 102                 if (cursor > limit &&
 103                     keyChar == text.charAt(cursor)) {
 104                     --cursor;
 105                 } else {
 106                     return U_MISMATCH;
 107                 }
 108             } else {
 109                 UMatchDegree m =
 110                     subm->matches(text, cursor, limit, incremental);
 111                 if (m != U_MATCH) {
 112                     return m;
 113                 }
 114             }
 115         }
 116         // Record the match position, but adjust for a normal
 117         // forward start, limit, and only if a prior match does not
 118         // exist -- we want the rightmost match.
 119         if (matchStart < 0) {
 120             matchStart = cursor+1;
 121             matchLimit = offset+1;
 122         }
 123     } else {
 124         for (i=0; i<pattern.length(); ++i) {
 125             if (incremental && cursor == limit) {
 126                 // We've reached the context limit without a mismatch and
 127                 // without completing our match.
 128                 return U_PARTIAL_MATCH;
 129             }
 130             UChar keyChar = pattern.charAt(i);
 131             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
 132             if (subm == 0) {
 133                 // Don't need the cursor < limit check if
 134                 // incremental is TRUE (because it's done above); do need
 135                 // it otherwise.
 136                 if (cursor < limit &&
 137                     keyChar == text.charAt(cursor)) {
 138                     ++cursor;
 139                 } else {
 140                     return U_MISMATCH;
 141                 }
 142             } else {
 143                 UMatchDegree m =
 144                     subm->matches(text, cursor, limit, incremental);
 145                 if (m != U_MATCH) {
 146                     return m;
 147                 }
 148             }
 149         }
 150         // Record the match position
 151         matchStart = offset;
 152         matchLimit = cursor;
 153     }
 154
 155     offset = cursor;
 156     return U_MATCH;
 157 }
 158
 159 /**
 160  * Implement UnicodeMatcher
 161  */
 162 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
 163                                         UBool escapeUnprintable) const
 164 {
 165     result.truncate(0);
 166     UnicodeString str, quoteBuf;
 167     if (segmentNumber > 0) {
 168         result.append((UChar)40); /*(*/
 169     }
 170     for (int32_t i=0; i<pattern.length(); ++i) {
 171         UChar keyChar = pattern.charAt(i);
 172         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
 173         if (m == 0) {
 174             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
 175         } else {
 176             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
 177                          TRUE, escapeUnprintable, quoteBuf);
 178         }
 179     }
 180     if (segmentNumber > 0) {
 181         result.append((UChar)41); /*)*/
 182     }
 183     // Flush quoteBuf out to result
 184     ICU_Utility::appendToRule(result, -1,
 185                               TRUE, escapeUnprintable, quoteBuf);
 186     return result;
 187 }
 188
 189 /**
 190  * Implement UnicodeMatcher
 191  */
 192 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
 193     if (pattern.length() == 0) {
 194         return TRUE;
 195     }
 196     UChar32 c = pattern.char32At(0);
 197     const UnicodeMatcher *m = data->lookupMatcher(c);
 198     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 199 }
 200
 201 /**
 202  * Implement UnicodeMatcher
 203  */
 204 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
 205     UChar32 ch;
 206     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
 207         ch = pattern.char32At(i);
 208         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
 209         if (matcher == NULL) {
 210             toUnionTo.add(ch);
 211         } else {
 212             matcher->addMatchSetTo(toUnionTo);
 213         }
 214     }
 215 }
 216
 217 /**
 218  * UnicodeReplacer API
 219  */
 220 int32_t StringMatcher::replace(Replaceable& text,
 221                                int32_t start,
 222                                int32_t limit,
 223                                int32_t& /*cursor*/) {
 224
 225     int32_t outLen = 0;
 226
 227     // Copy segment with out-of-band data
 228     int32_t dest = limit;
 229     // If there was no match, that means that a quantifier
 230     // matched zero-length.  E.g., x (a)* y matched "xy".
 231     if (matchStart >= 0) {
 232         if (matchStart != matchLimit) {
 233             text.copy(matchStart, matchLimit, dest);
 234             outLen = matchLimit - matchStart;
 235         }
 236     }
 237
 238     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
 239
 240     return outLen;
 241 }
 242
 243 /**
 244  * UnicodeReplacer API
 245  */
 246 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
 247                                                 UBool /*escapeUnprintable*/) const {
 248     // assert(segmentNumber > 0);
 249     rule.truncate(0);
 250     rule.append((UChar)0x0024 /*$*/);
 251     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
 252     return rule;
 253 }
 254
 255 /**
 256  * Remove any match info.  This must be called before performing a
 257  * set of matches with this segment.
 258  */
 259  void StringMatcher::resetMatch() {
 260     matchStart = matchLimit = -1;
 261 }
 262
 263 /**
 264  * Union the set of all characters that may output by this object
 265  * into the given set.
 266  * @param toUnionTo the set into which to union the output characters
 267  */
 268 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
 269     // The output of this replacer varies; it is the source text between
 270     // matchStart and matchLimit.  Since this varies depending on the
 271     // input text, we can't compute it here.  We can either do nothing
 272     // or we can add ALL characters to the set.  It's probably more useful
 273     // to do nothing.
 274 }
 275
 276 /**
 277  * Implement UnicodeFunctor
 278  */
 279 void StringMatcher::setData(const TransliterationRuleData* d) {
 280     data = d;
 281     int32_t i = 0;
 282     while (i<pattern.length()) {
 283         UChar32 c = pattern.char32At(i);
 284         UnicodeFunctor* f = data->lookup(c);
 285         if (f != NULL) {
 286             f->setData(data);
 287         }
 288         i += U16_LENGTH(c);
 289     }
 290 }
 291
 292 U_NAMESPACE_END
 293
 294 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 295
 296 //eof