icuSources/i18n/rbt_pars.h

   1 /*
   2 **********************************************************************
   3 * Copyright (C) 1999-2011, International Business Machines Corporation
   4 * and others. All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   11/17/99    aliu        Creation.
   8 **********************************************************************
   9 */
  10 #ifndef RBT_PARS_H
  11 #define RBT_PARS_H
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_TRANSLITERATION
  16 #ifdef __cplusplus
  17
  18 #include "unicode/uobject.h"
  19 #include "unicode/parseerr.h"
  20 #include "unicode/unorm.h"
  21 #include "rbt.h"
  22 #include "hash.h"
  23 #include "uvector.h"
  24
  25 U_NAMESPACE_BEGIN
  26
  27 class TransliterationRuleData;  // type pointed to by curData
  28 class UnicodeFunctor;           // adopted by generateStandInFor()
  29 class ParseData;                // type pointed to by parseData
  30 class RuleHalf;                 // declared a friend of TransliteratorParser
  31 class ParsePosition;            // passed by reference to parseSet()
  32 class StringMatcher;            // adopted by setSegmentObject()
  33
  34 class TransliteratorParser : public UMemory {
  35     // Parser for transliterator rules; parse() leaves its results in the public data members.
  36  public:
  37
  38     /**
  39      * PUBLIC data member.  A Vector of TransliterationRuleData objects,
  40      * one for each discrete group of rules in the rule set.
  41      */
  42     UVector dataVector;
  43
  44     /**
  45      * PUBLIC data member.
  46      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
  47      */
  48     UVector idBlockVector;
  49
  50     /**
  51      * PUBLIC data member containing the parsed compound filter, if any.
  52      */
  53     UnicodeSet* compoundFilter;
  54
  55  private:
  56
  57     /**
  58      * The current data object for which we are parsing rules.
  59      */
  60     TransliterationRuleData* curData;
  61
  62     UTransDirection direction; // presumably the direction given to parse() -- confirm in the implementation
  63
  64     /**
  65      * Parse error information.
  66      */
  67     UParseError parseError;
  68
  69     /**
  70      * Temporary symbol table used during parsing.
  71      */
  72     ParseData* parseData;
  73
  74     /**
  75      * Temporary vector of matcher variables.  When parsing is complete, this
  76      * is copied into the array data.variables.  As with data.variables,
  77      * element 0 corresponds to character data.variablesBase.
  78      */
  79     UVector variablesVector;
  80
  81     /**
  82      * Temporary table of variable names.  When parsing is complete, this is
  83      * copied into data.variableNames.
  84      */
  85     Hashtable variableNames;
  86
  87     /**
  88      * String of stand-ins for segments.  Used during the parsing of a single
  89      * rule.  segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
  90      * to StringMatcher object segmentObjects.elementAt(0), etc.
  91      */
  92     UnicodeString segmentStandins;
  93
  94     /**
  95      * Vector of StringMatcher objects for segments.  Used during the
  96      * parsing of a single rule.
  97      * segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
  98      * to StringMatcher object segmentObjects.elementAt(0), etc.
  99      */
 100     UVector segmentObjects;
 101
 102     /**
 103      * The next available stand-in for variables.  This starts at some point in
 104      * the private use area (discovered dynamically) and increments up toward
 105      * <code>variableLimit</code>.  At any point during parsing, available
 106      * variables are <code>variableNext..variableLimit-1</code>.
 107      */
 108     UChar variableNext;
 109
 110     /**
 111      * The exclusive upper limit of available stand-ins for variables.  This is
 112      * discovered dynamically.  At any point during parsing, available variables
 113      * are <code>variableNext..variableLimit-1</code>.
 114      */
 115     UChar variableLimit;
 116
 117     /**
 118      * When we encounter an undefined variable, we do not immediately signal
 119      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
 120      * Instead, we save the name of the undefined variable, and substitute
 121      * in the placeholder char variableLimit - 1, and decrement
 122      * variableLimit.
 123      */
 124     UnicodeString undefinedVariableName;
 125
 126     /**
 127      * The stand-in character for the 'dot' set, represented by '.' in
 128      * patterns.  This is allocated the first time it is needed, and
 129      * reused thereafter.
 130      */
 131     UChar dotStandIn;
 132
 133 public:
 134
 135     /**
 136      * Constructor.
 137      */
 138     TransliteratorParser(UErrorCode &statusReturn);
 139
 140     /**
 141      * Destructor.
 142      */
 143     ~TransliteratorParser();
 144
 145     /**
 146      * Parse the given string as a sequence of rules, separated by newline
 147      * characters ('\n'), and cause this object to implement those rules.  Any
 148      * previous rules are discarded.  Typically this method is called exactly
 149      * once after construction.
 150      *
 151      * Parse the given rules, in the given direction.  After this call
 152      * returns, query the public data members for results.  The caller
 153      * owns the 'data' (presumably the elements of dataVector -- confirm)
 154      * and 'compoundFilter' data members after this call returns.
 155      * @param rules      rules, separated by ';'
 156      * @param direction  either FORWARD or REVERSE.
 157      * @param pe         Struct to receive information on position
 158      *                   of error if an error is encountered
 159      * @param ec         Output param set to success/failure code.
 160      */
 161     void parse(const UnicodeString& rules,
 162                UTransDirection direction,
 163                UParseError& pe,
 164                UErrorCode& ec);
 165
 166     /**
 167      * Return the compound filter parsed by parse().  Caller owns result.
 168      * @return the compound filter parsed by parse().
 169      */
 170     UnicodeSet* orphanCompoundFilter();
 171
 172 private:
 173
 174     /**
 175      * Parse 'rules' in the given direction (presumably the worker behind parse() -- confirm).
 176      * @param rules      the rule text to parse (input; passed by const reference).
 177      * @param direction  either FORWARD or REVERSE.
 178      */
 179     void parseRules(const UnicodeString& rules,
 180                     UTransDirection direction,
 181                     UErrorCode& status);
 182
 183     /**
 184      * MAIN PARSER.  Parse the next rule in the given rule string, starting
 185      * at pos.  Return the index after the last character parsed.  Do not
 186      * parse characters at or after limit.
 187      *
 188      * Important:  The character at pos must be a non-whitespace character
 189      * that is not the comment character.
 190      *
 191      * This method handles quoting, escaping, and whitespace removal.  It
 192      * parses the end-of-rule character.  It recognizes context and cursor
 193      * indicators.  Once it does a lexical breakdown of the rule at pos, it
 194      * creates a rule object and adds it to our rule list.
 195      * @param rule       the rule string to parse (input).
 196      * @param pos        the starting position.
 197      * @param limit      index just past the last character of the rule.
 198      * @return           the index after the last character parsed.
 199      */
 200     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
 201
 202     /**
 203      * Set the variable range to [start, end] (inclusive).
 204      * @param start    the start value of the range.
 205      * @param end      the end value of the range.
 206      */
 207     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
 208
 209     /**
 210      * Assert that the given character is NOT within the variable range.
 211      * If it is, return FALSE.  This is necessary to ensure that the
 212      * variable range does not overlap characters used in a rule.
 213      * @param ch     the given character.
 214      * @return       True, if the given character is NOT within the variable range.
 215      */
 216     UBool checkVariableRange(UChar32 ch) const;
 217
 218     /**
 219      * Set the maximum backup to 'backup', in response to a pragma
 220      * statement.
 221      * @param backup    the new value to be set.
 222      */
 223     void pragmaMaximumBackup(int32_t backup);
 224
 225     /**
 226      * Begin normalizing all rules using the given mode, in response
 227      * to a pragma statement.
 228      * @param mode    the given mode.
 229      */
 230     void pragmaNormalizeRules(UNormalizationMode mode);
 231
 232     /**
 233      * Return true if the given rule looks like a pragma.
 234      * @param rule the rule string to examine.
 235      * @param pos offset to the first non-whitespace character of the rule.
 236      * @param limit index just past the last character of the rule.
 237      * @return true if the given rule looks like a pragma.
 238      */
 239     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
 240
 241     /**
 242      * Parse a pragma.  This method assumes resemblesPragma() has
 243      * already returned true.
 244      * @param rule the rule string containing the pragma.
 245      * @param pos offset to the first non-whitespace character of the rule.
 246      * @param limit index just past the last character of the rule.
 247      * @return the position index after the final ';' of the pragma,
 248      * or -1 on failure.
 249      */
 250     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
 251
 252     /**
 253      * Called by main parser upon syntax error.  Search the rule string
 254      * for the probable end of the rule.  Of course, if the error is that
 255      * the end of rule marker is missing, then the rule end will not be found.
 256      * In any case the rule start will be correctly reported.
 257      * @param parseErrorCode error code.
 258      * @param msg error description (this parameter is unnamed in the declaration).
 259      * @param start position of first character of current rule.
 260      * @return start position of first character of current rule.
 261      */
 262     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
 263                         UErrorCode& status);
 264
 265     /**
 266      * Parse a UnicodeSet out, store it, and return the stand-in character
 267      * used to represent it.
 268      *
 269      * @param rule    the rule for UnicodeSet.
 270      * @param pos     the position in pattern at which to start parsing.
 271      * @return        the stand-in character used to represent it.
 272      */
 273     UChar parseSet(const UnicodeString& rule,
 274                    ParsePosition& pos,
 275                    UErrorCode& status);
 276
 277     /**
 278      * Generate and return a stand-in for a new UnicodeFunctor.  Store
 279      * the matcher (adopt it).
 280      * @param adopted the UnicodeFunctor to be adopted.
 281      * @return        a stand-in for a new UnicodeFunctor.
 282      */
 283     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
 284
 285     /**
 286      * Return the stand-in for segment seg (1-based).
 287      * @param seg    the given segment.
 288      * @return       the stand-in character for the given segment.
 289      */
 290     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
 291
 292     /**
 293      * Set the object for segment seg (1-based).
 294      * @param seg      the given segment.
 295      * @param adopted  the StringMatcher to be adopted.
 296      */
 297     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
 298
 299     /**
 300      * Return the stand-in for the dot set.  It is allocated the first
 301      * time and reused thereafter.
 302      * @return    the stand-in for the dot set.
 303      */
 304     UChar getDotStandIn(UErrorCode& status);
 305
 306     /**
 307      * Append the value of the given variable name to the given
 308      * UnicodeString.
 309      * @param name    the name of the variable whose value is appended.
 310      * @param buf     the given UnicodeString to append to.
 311      */
 312     void appendVariableDef(const UnicodeString& name,
 313                            UnicodeString& buf,
 314                            UErrorCode& status);
 315
 316     /**
 317      * Glue method to get around access restrictions in C++.
 318      */
 319     /*static Transliterator* createBasicInstance(const UnicodeString& id,
 320                                                const UnicodeString* canonID);*/
 321
 322     friend class RuleHalf;
 323
 324     // Disallowed methods; no impl.
 325     /**
 326      * Copy constructor (declared private and not implemented, so copying is disallowed).
 327      */
 328     TransliteratorParser(const TransliteratorParser&);
 329
 330     /**
 331      * Assignment operator (declared private and not implemented, so assignment is disallowed).
 332      */
 333     TransliteratorParser& operator=(const TransliteratorParser&);
 334 };
 335
 336 U_NAMESPACE_END
 337
 338 #endif /* #ifdef __cplusplus */
 339
 340 /**
 341  * Strip/convert the following from the transliterator rules:
 342  *  - comments
 343  *  - newlines
 344  *  - white space at the beginning and end of a line
 345  *  - \u notation (unescaped)
 346  *
 347  * The target buffer must be equal in size to the source.
 348  * @internal
 349  */
 350 U_CAPI int32_t
 351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
 352
 353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 354
 355 #endif