icuSources/i18n/rbt_pars.h

   1 /*
   2 * Copyright (C) {1999-2003}, International Business Machines Corporation and others. All Rights Reserved.
   3 **********************************************************************
   4 *   Date        Name        Description
   5 *   11/17/99    aliu        Creation.
   6 **********************************************************************
   7 */
   8 #ifndef RBT_PARS_H
   9 #define RBT_PARS_H
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_TRANSLITERATION
  14
  15 #include "unicode/uobject.h"
  16 #include "unicode/parseerr.h"
  17 #include "unicode/unorm.h"
  18 #include "rbt.h"
  19
  20 U_NAMESPACE_BEGIN
  21
  22 class TransliterationRuleData;
  23 class UnicodeFunctor;
  24 class ParseData;
  25 class RuleHalf;
  26 class ParsePosition;
  27 class UVector;
  28 class StringMatcher;
  29
  30 class TransliteratorParser : public UMemory {
  31     // Parses transliterator rule strings; after parse(), results are read from the public data members below.
  32  public:
  33
  34     /**
  35      * PUBLIC data member containing the parsed data object, or null if
  36      * there were no rules.
  37      */
  38     TransliterationRuleData* data;
  39
  40     /**
  41      * PUBLIC data member.
  42      * The block of ::IDs, both at the top and at the bottom.
  43      * Inserted into these may be additional rules at the
  44      * idSplitPoint.
  45      */
  46     UnicodeString idBlock;
  47
  48     /**
  49      * PUBLIC data member.
  50      * In a compound RBT, the index at which the RBT rules are
  51      * inserted into the ID block.  Index 0 means before any IDs
  52      * in the block.  Index idBlock.length() means after all IDs
  53      * in the block.  Index is a string index.
  54      */
  55     int32_t idSplitPoint;
  56
  57     /**
  58      * PUBLIC data member containing the parsed compound filter, if any.
  59      */
  60     UnicodeSet* compoundFilter;
  61
  62  private:
  63
  64     // The number of rules parsed.  This tells us if there were
  65     // any actual transliterator rules, or if there were just ::ID
  66     // block IDs.
  67     int32_t ruleCount;
  68
  69     UTransDirection direction;
  70
  71     /**
  72      * We use a single error code during parsing.  Rather than pass it
  73      * through each API, we keep it here.
  74      */
  75     UErrorCode status;
  76
  77     /**
  78      * Parse error information.
  79      */
  80     UParseError parseError;
  81
  82     /**
  83      * Temporary symbol table used during parsing.
  84      */
  85     ParseData* parseData;
  86
  87     /**
  88      * Temporary vector of matcher variables.  When parsing is complete, this
  89      * is copied into the array data.variables.  As with data.variables,
  90      * element 0 corresponds to character data.variablesBase.
  91      */
  92     UVector* variablesVector;
  93
  94     /**
  95      * String of standins for segments.  Used during the parsing of a single
  96      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
  97      * to StringMatcher object segmentObjects.elementAt(0), etc.
  98      */
  99     UnicodeString segmentStandins;
 100
 101     /**
 102      * Vector of StringMatcher objects for segments.  Used during the
 103      * parsing of a single rule.
 104      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
 105      * to StringMatcher object segmentObjects.elementAt(0), etc.
 106      */
 107     UVector* segmentObjects;
 108
 109     /**
 110      * The next available stand-in for variables.  This starts at some point in
 111      * the private use area (discovered dynamically) and increments up toward
 112      * <code>variableLimit</code>.  At any point during parsing, available
 113      * variables are <code>variableNext..variableLimit-1</code>.
 114      */
 115     UChar variableNext;
 116
 117     /**
 118      * The last available stand-in for variables.  This is discovered
 119      * dynamically.  At any point during parsing, available variables are
 120      * <code>variableNext..variableLimit-1</code>.
 121      */
 122     UChar variableLimit;
 123
 124     /**
 125      * When we encounter an undefined variable, we do not immediately signal
 126      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
 127      * Instead, we save the name of the undefined variable, and substitute
 128      * in the placeholder char variableLimit - 1, and decrement
 129      * variableLimit.
 130      */
 131     UnicodeString undefinedVariableName;
 132
 133     /**
 134      * The stand-in character for the 'dot' set, represented by '.' in
 135      * patterns.  This is allocated the first time it is needed, and
 136      * reused thereafter.
 137      */
 138     UChar dotStandIn;
 139
 140 public:
 141
 142     /**
 143      * Constructor.
 144      */
 145     TransliteratorParser();
 146
 147     /**
 148      * Destructor.
 149      */
 150     ~TransliteratorParser();
 151
 152     /**
 153      * Parse the given string as a sequence of rules, separated by newline
 154      * characters ('\n'), and cause this object to implement those rules.  Any
 155      * previous rules are discarded.  Typically this method is called exactly
 156      * once after construction.
 157      *
 158      * Parse the given rules, in the given direction.  After this call
 159      * returns, query the public data members for results.  The caller
 160      * owns the 'data' and 'compoundFilter' data members after this
 161      * call returns.
 162      * @param rules      rules, separated by ';'
 163      * @param direction  either FORWARD or REVERSE.
 164      * @param pe         Struct to receive information on position
 165      *                   of error if an error is encountered
 166      * @param ec         Output param set to success/failure code.
 167      */
 168     void parse(const UnicodeString& rules,
 169                UTransDirection direction,
 170                UParseError& pe,
 171                UErrorCode& ec);
 172
 173     /**
 174      * Return the compound filter parsed by parse().  Caller owns result.
 175      * @return the compound filter parsed by parse().
 176      */
 177     UnicodeSet* orphanCompoundFilter();
 178
 179     /**
 180      * Return the data object parsed by parse().  Caller owns result.
 181      * @return the data object parsed by parse().
 182      */
 183     TransliterationRuleData* orphanData();
 184
 185 private:
 186
 187     /**
 188      * Parse the given rules, in the given direction.  Returns nothing.
 189      * @param rules      the rules to parse (input only; passed by const reference).
 190      * @param direction  either FORWARD or REVERSE.
 191      */
 192     void parseRules(const UnicodeString& rules,
 193                     UTransDirection direction);
 194
 195     /**
 196      * MAIN PARSER.  Parse the next rule in the given rule string, starting
 197      * at pos.  Return the index after the last character parsed.  Do not
 198      * parse characters at or after limit.
 199      *
 200      * Important:  The character at pos must be a non-whitespace character
 201      * that is not the comment character.
 202      *
 203      * This method handles quoting, escaping, and whitespace removal.  It
 204      * parses the end-of-rule character.  It recognizes context and cursor
 205      * indicators.  Once it does a lexical breakdown of the rule at pos, it
 206      * creates a rule object and adds it to our rule list.
 207      * @param rule       the rule string to parse (input only).
 208      * @param pos        the starting position.
 209      * @param limit      pointer past the last character of the rule.
 210      * @return           the index after the last character parsed.
 211      */
 212     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);
 213
 214     /**
 215      * Set the variable range to [start, end] (inclusive).
 216      * @param start    the start value of the range.
 217      * @param end      the end value of the range.
 218      */
 219     void setVariableRange(int32_t start, int32_t end);
 220
 221     /**
 222      * Assert that the given character is NOT within the variable range.
 223      * If it is, return FALSE.  This is necessary to ensure that the
 224      * variable range does not overlap characters used in a rule.
 225      * @param ch     the given character.
 226      * @return       True, if the given character is NOT within the variable range.
 227      */
 228     UBool checkVariableRange(UChar32 ch) const;
 229
 230     /**
 231      * Set the maximum backup to 'backup', in response to a pragma
 232      * statement.
 233      * @param backup    the new value to be set.
 234      */
 235     void pragmaMaximumBackup(int32_t backup);
 236
 237     /**
 238      * Begin normalizing all rules using the given mode, in response
 239      * to a pragma statement.
 240      * @param mode    the given mode.
 241      */
 242     void pragmaNormalizeRules(UNormalizationMode mode);
 243
 244     /**
 245      * Return true if the given rule looks like a pragma.
 246      * @param pos offset to the first non-whitespace character
 247      * of the rule.
 248      * @param limit pointer past the last character of the rule.
 249      * @return true if the given rule looks like a pragma.
 250      */
 251     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
 252
 253     /**
 254      * Parse a pragma.  This method assumes resemblesPragma() has
 255      * already returned true.
 256      * @param pos offset to the first non-whitespace character
 257      * of the rule.
 258      * @param limit pointer past the last character of the rule.
 259      * @return the position index after the final ';' of the pragma,
 260      * or -1 on failure.
 261      */
 262     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);
 263
 264     /**
 265      * Called by main parser upon syntax error.  Search the rule string
 266      * for the probable end of the rule.  Of course, if the error is that
 267      * the end of rule marker is missing, then the rule end will not be found.
 268      * In any case the rule start will be correctly reported.
 269      * @param parseErrorCode error code.
 270      * @param msg error description (the parameter is unnamed in this declaration).
 271      * @param start position of first character of current rule.
 272      * @return start position of first character of current rule.
 273      */
 274     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);
 275
 276     /**
 277      * Parse a UnicodeSet out, store it, and return the stand-in character
 278      * used to represent it.
 279      *
 280      * @param rule    the rule for UnicodeSet.
 281      * @param pos     the position in pattern at which to start parsing.
 282      * @return        the stand-in character used to represent it.
 283      */
 284     UChar parseSet(const UnicodeString& rule,
 285                    ParsePosition& pos);
 286
 287     /**
 288      * Generate and return a stand-in for a new UnicodeFunctor.  Store
 289      * the matcher (adopt it).
 290      * @param adopted the UnicodeFunctor to be adopted.
 291      * @return        a stand-in for a new UnicodeFunctor.
 292      */
 293     UChar generateStandInFor(UnicodeFunctor* adopted);
 294
 295     /**
 296      * Return the standin for segment seg (1-based).
 297      * @param seg    the given segment.
 298      * @return       the standIn character for the given segment.
 299      */
 300     UChar getSegmentStandin(int32_t seg);
 301
 302     /**
 303      * Set the object for segment seg (1-based).
 304      * @param seg      the given segment.
 305      * @param adopted  the StringMatcher to be adopted.
 306      */
 307     void setSegmentObject(int32_t seg, StringMatcher* adopted);
 308
 309     /**
 310      * Return the stand-in for the dot set.  It is allocated the first
 311      * time and reused thereafter.
 312      * @return    the stand-in for the dot set.
 313      */
 314     UChar getDotStandIn();
 315
 316     /**
 317      * Append the value of the given variable name to the given
 318      * UnicodeString.
 319      * @param name    the variable name to be appended.
 320      * @param buf     the given UnicodeString to append to.
 321      */
 322     void appendVariableDef(const UnicodeString& name,
 323                            UnicodeString& buf);
 324
 325     /**
 326      * Glue method to get around access restrictions in C++.
 327      */
 328     static Transliterator* createBasicInstance(const UnicodeString& id,
 329                                                const UnicodeString* canonID);
 330
 331     friend class RuleHalf;
 332
 333     // Disallowed methods; no impl.
 334     /**
 335      * Copy constructor
 336      */
 337     TransliteratorParser(const TransliteratorParser&);
 338
 339     /**
 340      * Assignment operator
 341      */
 342     TransliteratorParser& operator=(const TransliteratorParser&);
 343 };
 344
 345 U_NAMESPACE_END
 346
 347 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 348
 349 #endif