icuSources/i18n/rbt_rule.h

   1 /*
   2 * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
   3 **********************************************************************
   4 *   Date        Name        Description
   5 *   11/17/99    aliu        Creation.
   6 **********************************************************************
   7 */
   8 #ifndef RBT_RULE_H
   9 #define RBT_RULE_H
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uobject.h"
  16 #include "unicode/unistr.h"
  17 #include "unicode/utrans.h"
  18 #include "unicode/unimatch.h"
  19
  20 U_NAMESPACE_BEGIN
  21
  22 class Replaceable;              // used in this header only by reference (matchAndReplace)
  23 class TransliterationRuleData;  // used in this header only through const pointers
  24 class StringMatcher;            // used in this header through pointers and as a friend
  25 class UnicodeFunctor;           // used in this header only through pointers
  26
  27 /**
  28  * A transliteration rule used by
  29  * <code>RuleBasedTransliterator</code>.
  30  * <code>TransliterationRule</code> is documented as immutable; the one public
  31  * mutator declared here is setData(), which changes the data object it belongs to.
  32  * <p>A rule consists of an input pattern and an output string.  When
  33  * the input pattern is matched, the output string is emitted.  The
  34  * input pattern consists of zero or more characters which are matched
  35  * exactly (the key) and optional context.  Context must match if it
  36  * is specified.  Context may be specified before the key, after the
  37  * key, or both.  The key, preceding context, and following context
  38  * may contain variables.  Variables represent a set of Unicode
  39  * characters, such as the letters <i>a</i> through <i>z</i>.
  40  * Variables are detected by looking up each character in a supplied
  41  * variable list to see if it has been so defined.
  42  *
  43  * <p>A rule may contain segments in its input string and segment
  44  * references in its output string.  A segment is a substring of the
  45  * input pattern, indicated by an offset and limit.  The segment may
  46  * be in the preceding or following context.  It may not span a
  47  * context boundary.  A segment reference is a special character in
  48  * the output string that causes a segment of the input string (not
  49  * the input pattern) to be copied to the output string.  The range of
  50  * special characters that represent segment references is defined by
  51  * RuleBasedTransliterator.Data.
  52  *
  53  * @author Alan Liu
  54  */
  55 class TransliterationRule : public UMemory {
  56
  57 private:
  58
  59     // TODO Eliminate the pattern and keyLength data members.  They
  60     // are used only by masks() and getIndexValue() which are called
  61     // only during build time, not during run-time.  Perhaps these
  62     // methods and pattern/keyLength can be isolated into a separate
  63     // object.
  64
  65     /**
  66      * The match that must occur before the key, or null if there is no
  67      * preceding context.
  68      */
  69     StringMatcher *anteContext;
  70
  71     /**
  72      * The matcher object for the key.  If null, then the key is empty.
  73      */
  74     StringMatcher *key;
  75
  76     /**
  77      * The match that must occur after the key, or null if there is no
  78      * following context.
  79      */
  80     StringMatcher *postContext;
  81
  82     /**
  83      * The object that performs the replacement if the key,
  84      * anteContext, and postContext are matched.  Never null.
  85      */
  86     UnicodeFunctor* output;
  87
  88     /**
  89      * The string that must be matched, consisting of the anteContext, key,
  90      * and postContext, concatenated together, in that order.  Some components
  91      * may be empty (zero length).
  92      * @see anteContextLength
  93      * @see keyLength
  94      */
  95     UnicodeString pattern;
  96
  97     /**
  98      * An array of matcher objects corresponding to the input pattern
  99      * segments.  If there are no segments this is null.  N.B. Elements are
 100      * declared as UnicodeFunctor for generality, but in practice each is a
 101      * StringMatcher.  In the future we may generalize this, but for
 102      * now we sometimes cast down to StringMatcher.
 103      *
 104      * The array is owned, but the pointers within it are not.
 105      */
 106     UnicodeFunctor** segments;
 107
 108     /**
 109      * The number of elements in segments[] or zero if segments is NULL.
 110      */
 111     int32_t segmentsCount;
 112
 113     /**
 114      * The length of the string that must match before the key.  If
 115      * zero, then there is no matching requirement before the key.
 116      * Substring [0,anteContextLength) of pattern is the anteContext.
 117      */
 118     int32_t anteContextLength;
 119
 120     /**
 121      * The length of the key.  Substring [anteContextLength,
 122      * anteContextLength + keyLength) is the key.
 123      *
 124      */
 125     int32_t keyLength;
 126
 127     /**
 128      * Miscellaneous attributes (presumably a mask of the ANCHOR_* flags; confirm).
 129      */
 130     int8_t flags;
 131
 132     /**
 133      * Flag attributes; the values are distinct bits (1 and 2).
 134      */
 135     enum {
 136         ANCHOR_START = 1,
 137         ANCHOR_END   = 2
 138     };
 139
 140     /**
 141      * An alias pointer to the data for this rule.  The data provides
 142      * lookup services for matchers and segments.
 143      */
 144     const TransliterationRuleData* data;
 145
 146 public:
 147
 148     /**
 149      * Construct a new rule with the given input, output text, and other
 150      * attributes.  A cursor position may be specified for the output text.
 151      * @param input          input string, including key and optional ante and
 152      *                       post context.
 153      * @param anteContextPos offset into input to end of ante context, or -1 if
 154      *                       none.  Must be <= input.length() if not -1.
 155      * @param postContextPos offset into input to start of post context, or -1
 156      *                       if none.  Must be <= input.length() if not -1, and must be >=
 157      *                       anteContextPos.
 158      * @param outputStr      output string.
 159      * @param cursorPosition offset into output at which cursor is located, or -1 if
 160      *                       none.  If less than zero, then the cursor is placed after the
 161      *                       <code>output</code>; that is, -1 is equivalent to
 162      *                       <code>output.length()</code>.  If greater than
 163      *                       <code>output.length()</code> then an error is raised (NOTE(review): presumably via status -- confirm).
 164      * @param cursorOffset   an offset to be added to cursorPosition to position the
 165      *                       cursor either in the ante context, if < 0, or in the post context, if >
 166      *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
 167      *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
 168      *                       of -3.
 169      * @param segs           array of UnicodeFunctor corresponding to input pattern
 170      *                       segments, or null if there are none.  The array itself is adopted,
 171      *                       but the pointers within it are not.
 172      * @param segsCount      number of elements in segs[].
 173      * @param anchorStart    TRUE if the rule is anchored on the left to
 174      *                       the context start.
 175      * @param anchorEnd      TRUE if the rule is anchored on the right to the
 176      *                       context limit.
 177      * @param data           the rule data.
 178      * @param status         Output parameter filled in with success or failure status.
 179      */
 180     TransliterationRule(const UnicodeString& input,
 181                         int32_t anteContextPos, int32_t postContextPos,
 182                         const UnicodeString& outputStr,
 183                         int32_t cursorPosition, int32_t cursorOffset,
 184                         UnicodeFunctor** segs,
 185                         int32_t segsCount,
 186                         UBool anchorStart, UBool anchorEnd,
 187                         const TransliterationRuleData* data,
 188                         UErrorCode& status);
 189
 190     /**
 191      * Copy constructor.
 192      * @param other    the object to be copied (taken by non-const reference).
 193      */
 194     TransliterationRule(TransliterationRule& other);
 195
 196     /**
 197      * Destructor.
 198      */
 199     virtual ~TransliterationRule();
 200
 201     /**
 202      * Change the data object that this rule belongs to.  Used
 203      * internally by the TransliterationRuleData copy constructor.
 204      * @param data    the new data value to be set.
 205      */
 206     void setData(const TransliterationRuleData* data);
 207
 208     /**
 209      * Return the preceding context length.  This method is needed to
 210      * support the <code>Transliterator</code> method
 211      * <code>getMaximumContextLength()</code>.  Internally, this is
 212      * implemented as the anteContextLength, optionally plus one if
 213      * there is a start anchor.  The one character anchor gap is
 214      * needed to make repeated incremental transliteration with
 215      * anchors work.
 216      * @return    the preceding context length.
 217      */
 218     virtual int32_t getContextLength(void) const;
 219
 220     /**
 221      * Internal method.  Returns 8-bit index value for this rule.
 222      * This is the low byte of the first character of the key,
 223      * unless the first character of the key is a set.  If it's a
 224      * set, or otherwise can match multiple keys, the index value is -1.
 225      * @return    8-bit index value for this rule, or -1 as described above.
 226      */
 227     int16_t getIndexValue() const;
 228
 229     /**
 230      * Internal method.  Returns true if this rule matches the given
 231      * index value.  The index value is an 8-bit integer, 0..255,
 232      * representing the low byte of the first character of the key.
 233      * It matches this rule if it matches the first character of the
 234      * key, or if the first character of the key is a set, and the set
 235      * contains any character with a low byte equal to the index
 236      * value.  If the rule contains only ante context, as in foo)>bar,
 237      * then it will match any key.
 238      * @param v    the given index value.
 239      * @return     true if this rule matches the given index value.
 240      */
 241     UBool matchesIndexValue(uint8_t v) const;
 242
 243     /**
 244      * Return true if this rule masks another rule.  If r1 masks r2 then
 245      * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
 246      * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
 247      * "[c]a>x" masks "[dc]a>y".
 248      * @param r2  the given rule to be compared with.
 249      * @return    true if this rule masks 'r2'
 250      */
 251     virtual UBool masks(const TransliterationRule& r2) const;
 252
 253     /**
 254      * Attempt a match and replacement at the given position.  Return
 255      * the degree of match between this rule and the given text.  The
 256      * degree of match may be mismatch, a partial match, or a full
 257      * match.  A mismatch means at least one character of the text
 258      * does not match the context or key.  A partial match means some
 259      * context and key characters match, but the text is not long
 260      * enough to match all of them.  A full match means all context
 261      * and key characters match.
 262      *
 263      * If a full match is obtained, perform a replacement, update pos,
 264      * and return U_MATCH.  Otherwise both text and pos are unchanged.
 265      *
 266      * @param text the text
 267      * @param pos the position indices
 268      * @param incremental if TRUE, test for partial matches that may
 269      * be completed by additional text inserted at pos.limit.
 270      * @return one of <code>U_MISMATCH</code>,
 271      * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
 272      * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
 273      */
 274     UMatchDegree matchAndReplace(Replaceable& text,
 275                                  UTransPosition& pos,
 276                                  UBool incremental) const;
 277
 278     /**
 279      * Append a rule string representing this rule object to pat.  NOTE(review):
 280      * presumably returns pat; escapeUnprintable's effect is not shown here -- confirm.
 281      */
 282     virtual UnicodeString& toRule(UnicodeString& pat,
 283                                   UBool escapeUnprintable) const;
 284
 285     /**
 286      * Union the set of all characters that may be modified by this rule
 287      * into the given set, toUnionTo.
 288      */
 289     void addSourceSetTo(UnicodeSet& toUnionTo) const;
 290
 291     /**
 292      * Union the set of all characters that may be emitted by this rule
 293      * into the given set, toUnionTo.
 294      */
 295     void addTargetSetTo(UnicodeSet& toUnionTo) const;
 296
 297  private:
 298
 299     friend class StringMatcher; // grants StringMatcher access to private members
 300
 301     TransliterationRule &operator=(const TransliterationRule &other); // private: forbids assignment (copy construction is public)
 302 };
 303
 304 U_NAMESPACE_END
 305
 306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 307
 308 #endif