icuSources/common/rbbiscan.cpp

   1
   2 //
   3 //  file:  rbbiscan.cpp
   4 //
   5 //  Copyright (C) 2002-2003, International Business Machines Corporation and others.
   6 //  All Rights Reserved.
   7 //
   8 //  This file contains the Rule Based Break Iterator Rule Builder functions for
   9 //   scanning the rules and assembling a parse tree.  This is the first phase
  10 //   of compiling the rules.
  11 //
  12 //  The overall compilation of the rules is managed by class RBBIRuleBuilder, which will
  13 //  create and use an instance of this class as part of the process.
  14 //
  15
  16 #include "unicode/utypes.h"
  17
  18 #if !UCONFIG_NO_BREAK_ITERATION
  19
  20 #include "unicode/unistr.h"
  21 #include "unicode/uniset.h"
  22 #include "unicode/uchar.h"
  23 #include "unicode/uchriter.h"
  24 #include "unicode/parsepos.h"
  25 #include "unicode/parseerr.h"
  26 #include "uprops.h"
  27 #include "cmemory.h"
  28 #include "cstring.h"
  29
  30 #include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
  31                        //   generated by a Perl script.
  32 #include "rbbirb.h"
  33 #include "rbbinode.h"
  34 #include "rbbiscan.h"
  35
  36 #include "uassert.h"
  37
  38
  39 //----------------------------------------------------------------------------------------
  40 //
  41 // Unicode Set init strings for each of the character classes needed for parsing a rule file.
  42 //               (Initialized with hex values for portability to EBCDIC based machines.
  43 //                Really ugly, but there's no good way to avoid it.)
  44 //
  45 //              The sets are referred to by name in the rbbirpt.txt, which is the
  46 //              source form of the state transition table for the RBBI rule parser.
  47 //
  48 //----------------------------------------------------------------------------------------
  49 static const UChar gRuleSet_rule_char_pattern[]       = {
  50  //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0
  51     0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
  52  //   -    \      u    0     0     7      f     ]     -     [    \      p
  53     0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
  54  //   {     L     }    ]     -     [      \     p     {     N    }      ]     ]
  55     0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
  56
  57 static const UChar gRuleSet_name_char_pattern[]       = {
  58 //    [    _      \    p     {     L      }     \     p     {    N      }     ]
  59     0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
  60
  61 static const UChar gRuleSet_digit_char_pattern[] = {
  62 //    [    0      -    9     ]
  63     0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
  64
  65 static const UChar gRuleSet_name_start_char_pattern[] = {
  66 //    [    _      \    p     {     L      }     ]
  67     0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
  68
  69 static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"
  70
  71
  72 U_CDECL_BEGIN
  73 static void  U_EXPORT2 U_CALLCONV RBBISetTable_deleter(void *p) {
  74     RBBISetTableEl *px = (RBBISetTableEl *)p;
  75     delete px->key;
  76     // Note:  px->val is owned by the linked list "fSetsListHead" in scanner.
  77     //        Don't delete the value nodes here.
  78     uprv_free(px);
  79 }
  80 U_CDECL_END
  81
  82 U_NAMESPACE_BEGIN
  83
  84 //----------------------------------------------------------------------------------------
  85 //
  86 //  Constructor.
  87 //
  88 //----------------------------------------------------------------------------------------
  89 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
  90 {
  91     fRB                 = rb;
  92     fStackPtr           = 0;
  93     fStack[fStackPtr]   = 0;
  94     fNodeStackPtr       = 0;
  95     fRuleNum            = 0;
  96     fNodeStack[0]       = NULL;
  97
  98     fRuleSets[kRuleSet_rule_char-128]       = NULL;
  99     fRuleSets[kRuleSet_white_space-128]     = NULL;
 100     fRuleSets[kRuleSet_name_char-128]       = NULL;
 101     fRuleSets[kRuleSet_name_start_char-128] = NULL;
 102     fRuleSets[kRuleSet_digit_char-128]      = NULL;
 103     fSymbolTable                            = NULL;
 104     fSetTable                               = NULL;
 105
 106     fScanIndex = 0;
 107     fNextIndex = 0;
 108
 109     fReverseRule        = FALSE;
 110     fLookAheadRule      = FALSE;
 111
 112     fLineNum    = 1;
 113     fCharNum    = 0;
 114     fQuoteMode  = FALSE;
 115
 116     if (U_FAILURE(*rb->fStatus)) {
 117         return;
 118     }
 119
 120     //
 121     //  Set up the constant Unicode Sets.
 122     //     Note:  These could be made static, lazily initialized, and shared among
 123     //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,
 124     //            and the time to build these few sets should be small compared to a
 125     //            full break iterator build.
 126     fRuleSets[kRuleSet_rule_char-128]       = new UnicodeSet(gRuleSet_rule_char_pattern,       *rb->fStatus);
 127     fRuleSets[kRuleSet_white_space-128]     = (UnicodeSet*) uprv_openRuleWhiteSpaceSet(rb->fStatus);
 128     fRuleSets[kRuleSet_name_char-128]       = new UnicodeSet(gRuleSet_name_char_pattern,       *rb->fStatus);
 129     fRuleSets[kRuleSet_name_start_char-128] = new UnicodeSet(gRuleSet_name_start_char_pattern, *rb->fStatus);
 130     fRuleSets[kRuleSet_digit_char-128]      = new UnicodeSet(gRuleSet_digit_char_pattern,      *rb->fStatus);
 131     if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
 132         // This case happens if ICU's data is missing.  UnicodeSet tries to look up property
 133         //   names from the init string, can't find them, and claims an illegal arguement.
 134         //   Change the error so that the actual problem will be clearer to users.
 135         *rb->fStatus = U_BRK_INIT_ERROR;
 136     }
 137     if (U_FAILURE(*rb->fStatus)) {
 138         return;
 139     }
 140
 141     fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
 142     fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, rb->fStatus);
 143     uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
 144 }
 145
 146
 147
 148 //----------------------------------------------------------------------------------------
 149 //
 150 //  Destructor
 151 //
 152 //----------------------------------------------------------------------------------------
 153 RBBIRuleScanner::~RBBIRuleScanner() {
 154     delete fRuleSets[kRuleSet_rule_char-128];
 155     delete fRuleSets[kRuleSet_white_space-128];
 156     delete fRuleSets[kRuleSet_name_char-128];
 157     delete fRuleSets[kRuleSet_name_start_char-128];
 158     delete fRuleSets[kRuleSet_digit_char-128];
 159
 160     delete fSymbolTable;
 161     if (fSetTable != NULL) {
 162          uhash_close(fSetTable);
 163          fSetTable = NULL;
 164
 165     }
 166
 167
 168     // Node Stack.
 169     //   Normally has one entry, which is the entire parse tree for the rules.
 170     //   If errors occured, there may be additional subtrees left on the stack.
 171     while (fNodeStackPtr > 0) {
 172         delete fNodeStack[fNodeStackPtr];
 173         fNodeStackPtr--;
 174     }
 175
 176 }
 177
 178 //----------------------------------------------------------------------------------------
 179 //
 180 //  doParseActions       Do some action during rule parsing.
 181 //                       Called by the parse state machine.
 182 //                       Actions build the parse tree and Unicode Sets,
 183 //                       and maintain the parse stack for nested expressions.
 184 //
 185 //                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
 186 //                              They represent exactly the same thing.  They're separate
 187 //                              only to work around enum forward declaration restrictions
 188 //                              in some compilers, while at the same time avoiding multiple
 189 //                              definitions problems.  I'm sure that there's a better way.
 190 //
 191 //----------------------------------------------------------------------------------------
 192 UBool RBBIRuleScanner::doParseActions(EParseAction action)
 193 {
 194     RBBINode *n       = NULL;
 195
 196     UBool   returnVal = TRUE;
 197
 198     switch ((RBBI_RuleParseAction)action) {
 199
 200     case doExprStart:
 201         pushNewNode(RBBINode::opStart);
 202         fRuleNum++;
 203         break;
 204
 205
 206     case doExprOrOperator:
 207         {
 208             fixOpStack(RBBINode::precOpCat);
 209             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
 210             RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
 211             orNode->fLeftChild     = operandNode;
 212             operandNode->fParent   = orNode;
 213         }
 214         break;
 215
 216     case doExprCatOperator:
 217         // concatenation operator.
 218         // For the implicit concatenation of adjacent terms in an expression that are
 219         //   not separated by any other operator.  Action is invoked between the
 220         //   actions for the two terms.
 221         {
 222             fixOpStack(RBBINode::precOpCat);
 223             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
 224             RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
 225             catNode->fLeftChild    = operandNode;
 226             operandNode->fParent   = catNode;
 227         }
 228         break;
 229
 230     case doLParen:
 231         // Open Paren.
 232         //   The openParen node is a dummy operation type with a low precedence,
 233         //     which has the affect of ensuring that any real binary op that
 234         //     follows within the parens binds more tightly to the operands than
 235         //     stuff outside of the parens.
 236         pushNewNode(RBBINode::opLParen);
 237         break;
 238
 239     case doExprRParen:
 240         fixOpStack(RBBINode::precLParen);
 241         break;
 242
 243     case doNOP:
 244         break;
 245
 246     case doStartAssign:
 247         // We've just scanned "$variable = "
 248         // The top of the node stack has the $variable ref node.
 249
 250         // Save the start position of the RHS text in the StartExpression node
 251         //   that precedes the $variableReference node on the stack.
 252         //   This will eventually be used when saving the full $variable replacement
 253         //   text as a string.
 254         n = fNodeStack[fNodeStackPtr-1];
 255         n->fFirstPos = fNextIndex;              // move past the '='
 256
 257         // Push a new start-of-expression node; needed to keep parse of the
 258         //   RHS expression happy.
 259         pushNewNode(RBBINode::opStart);
 260         break;
 261
 262
 263
 264
 265     case doEndAssign:
 266         {
 267             // We have reached the end of an assignement statement.
 268             //   Current scan char is the ';' that terminates the assignment.
 269
 270             // Terminate expression, leaves expression parse tree rooted in TOS node.
 271             fixOpStack(RBBINode::precStart);
 272
 273             RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
 274             RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
 275             RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];
 276
 277             // Save original text of right side of assignment, excluding the terminating ';'
 278             //  in the root of the node for the right-hand-side expression.
 279             RHSExprNode->fFirstPos = startExprNode->fFirstPos;
 280             RHSExprNode->fLastPos  = fScanIndex;
 281             fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
 282
 283             // Expression parse tree becomes l. child of the $variable reference node.
 284             varRefNode->fLeftChild = RHSExprNode;
 285             RHSExprNode->fParent   = varRefNode;
 286
 287             // Make a symbol table entry for the $variableRef node.
 288             fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
 289
 290             // Clean up the stack.
 291             delete startExprNode;
 292             fNodeStackPtr-=3;
 293             break;
 294         }
 295
 296     case doEndOfRule:
 297         {
 298         fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
 299         if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
 300             break;
 301         }
 302         if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
 303         U_ASSERT(fNodeStackPtr == 1);
 304
 305         // If this rule includes a look-ahead '/', add a endMark node to the
 306         //   expression tree.
 307         if (fLookAheadRule) {
 308             RBBINode  *thisRule       = fNodeStack[fNodeStackPtr];
 309             RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
 310             RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
 311             fNodeStackPtr -= 2;
 312             catNode->fLeftChild       = thisRule;
 313             catNode->fRightChild      = endNode;
 314             fNodeStack[fNodeStackPtr] = catNode;
 315             endNode->fVal             = fRuleNum;
 316             endNode->fLookAheadEnd    = TRUE;
 317         }
 318
 319         // All rule expressions are ORed together.
 320         // The ';' that terminates an expression really just functions as a '|' with
 321         //   a low operator prededence.
 322         //
 323         // Forward and reverse rules are collected separately.  Or this rule into
 324         //  the appropriate group of them.
 325         //
 326         RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : &fRB->fForwardTree);
 327
 328         if (*destRules != NULL) {
 329             // This is not the first rule encounted.
 330             // OR previous stuff  (from *destRules)
 331             // with the current rule expression (on the Node Stack)
 332             //  with the resulting OR expression going to *destRules
 333             //
 334             RBBINode  *thisRule    = fNodeStack[fNodeStackPtr];
 335             RBBINode  *prevRules   = *destRules;
 336             RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
 337             orNode->fLeftChild     = prevRules;
 338             prevRules->fParent     = orNode;
 339             orNode->fRightChild    = thisRule;
 340             thisRule->fParent      = orNode;
 341             *destRules             = orNode;
 342         }
 343         else
 344         {
 345             // This is the first rule encountered (for this direction).
 346             // Just move its parse tree from the stack to *destRules.
 347             *destRules = fNodeStack[fNodeStackPtr];
 348         }
 349         fReverseRule   = FALSE;   // in preparation for the next rule.
 350         fLookAheadRule = FALSE;
 351         fNodeStackPtr  = 0;
 352         }
 353         break;
 354
 355
 356     case doRuleError:
 357         error(U_BRK_RULE_SYNTAX);
 358         returnVal = FALSE;
 359         break;
 360
 361
 362     case doVariableNameExpectedErr:
 363         error(U_BRK_RULE_SYNTAX);
 364         break;
 365
 366
 367     //
 368     //  Unary operands  + ? *
 369     //    These all appear after the operand to which they apply.
 370     //    When we hit one, the operand (may be a whole sub expression)
 371     //    will be on the top of the stack.
 372     //    Unary Operator becomes TOS, with the old TOS as its one child.
 373     case doUnaryOpPlus:
 374         {
 375             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
 376             RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
 377             plusNode->fLeftChild   = operandNode;
 378             operandNode->fParent   = plusNode;
 379         }
 380         break;
 381
 382     case doUnaryOpQuestion:
 383         {
 384             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
 385             RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
 386             qNode->fLeftChild      = operandNode;
 387             operandNode->fParent   = qNode;
 388         }
 389         break;
 390
 391     case doUnaryOpStar:
 392         {
 393             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
 394             RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
 395             starNode->fLeftChild   = operandNode;
 396             operandNode->fParent   = starNode;
 397         }
 398         break;
 399
 400     case doRuleChar:
 401         // A "Rule Character" is any single character that is a literal part
 402         // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
 403         // These are pretty uncommon in break rules; the terms are more commonly
 404         //  sets.  To keep things uniform, treat these characters like as
 405         // sets that just happen to contain only one character.
 406         {
 407             n = pushNewNode(RBBINode::setRef);
 408             findSetFor(fC.fChar, n);
 409             n->fFirstPos = fScanIndex;
 410             n->fLastPos  = fNextIndex;
 411             fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
 412             break;
 413         }
 414
 415     case doDotAny:
 416         // scanned a ".", meaning match any single character.
 417         {
 418             n = pushNewNode(RBBINode::setRef);
 419             findSetFor(kAny, n);
 420             n->fFirstPos = fScanIndex;
 421             n->fLastPos  = fNextIndex;
 422             fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
 423             break;
 424         }
 425         break;
 426
 427     case doSlash:
 428         // Scanned a '/', which identifies a look-ahead break position in a rule.
 429         n = pushNewNode(RBBINode::lookAhead);
 430         n->fVal      = fRuleNum;
 431         n->fFirstPos = fScanIndex;
 432         n->fLastPos  = fNextIndex;
 433         fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
 434         fLookAheadRule = TRUE;
 435         break;
 436
 437
 438     case doStartTagValue:
 439         // Scanned a '{', the opening delimiter for a tag value within a rule.
 440         n = pushNewNode(RBBINode::tag);
 441         n->fVal      = 0;
 442         n->fFirstPos = fScanIndex;
 443         n->fLastPos  = fNextIndex;
 444         break;
 445
 446     case doTagDigit:
 447         // Just scanned a decimal digit that's part of a tag value
 448         {
 449             n = fNodeStack[fNodeStackPtr];
 450             uint32_t v = u_charDigitValue(fC.fChar);
 451             U_ASSERT(v < 10);
 452             n->fVal = n->fVal*10 + v;
 453             break;
 454         }
 455
 456     case doTagValue:
 457         n = fNodeStack[fNodeStackPtr];
 458         n->fLastPos = fNextIndex;
 459         fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
 460         break;
 461
 462
 463
 464     case doReverseDir:
 465         fReverseRule = TRUE;
 466         break;
 467
 468     case doStartVariableName:
 469         n = pushNewNode(RBBINode::varRef);
 470         if (U_FAILURE(*fRB->fStatus)) {
 471             break;
 472         }
 473         n->fFirstPos = fScanIndex;
 474         break;
 475
 476     case doEndVariableName:
 477         n = fNodeStack[fNodeStackPtr];
 478         if (n==NULL || n->fType != RBBINode::varRef) {
 479             error(U_BRK_INTERNAL_ERROR);
 480             break;
 481         }
 482         n->fLastPos = fScanIndex;
 483         fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
 484         // Look the newly scanned name up in the symbol table
 485         //   If there's an entry, set the l. child of the var ref to the replacement expression.
 486         //   (We also pass through here when scanning assignments, but no harm is done, other
 487         //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
 488         n->fLeftChild = fSymbolTable->lookupNode(n->fText);
 489         break;
 490
 491     case doCheckVarDef:
 492         n = fNodeStack[fNodeStackPtr];
 493         if (n->fLeftChild == NULL) {
 494             error(U_BRK_UNDEFINED_VARIABLE);
 495             returnVal = FALSE;
 496         }
 497         break;
 498
 499     case doExprFinished:
 500         break;
 501
 502     case doRuleErrorAssignExpr:
 503         error(U_BRK_ASSIGN_ERROR);
 504         returnVal = FALSE;
 505         break;
 506
 507     case doExit:
 508         returnVal = FALSE;
 509         break;
 510
 511     case doScanUnicodeSet:
 512         scanSet();
 513         break;
 514
 515     default:
 516         error(U_BRK_INTERNAL_ERROR);
 517         returnVal = FALSE;
 518         break;
 519     }
 520     return returnVal;
 521 }
 522
 523
 524
 525
 526 //----------------------------------------------------------------------------------------
 527 //
 528 //  Error         Report a rule parse error.
 529 //                Only report it if no previous error has been recorded.
 530 //
 531 //----------------------------------------------------------------------------------------
 532 void RBBIRuleScanner::error(UErrorCode e) {
 533     if (U_SUCCESS(*fRB->fStatus)) {
 534         *fRB->fStatus = e;
 535         fRB->fParseError->line  = fLineNum;
 536         fRB->fParseError->offset = fCharNum;
 537         fRB->fParseError->preContext[0] = 0;
 538         fRB->fParseError->preContext[0] = 0;
 539     }
 540 }
 541
 542
 543
 544
 545 //----------------------------------------------------------------------------------------
 546 //
 547 //  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
 548 //               An entry on the stack may be as small as a single setRef node,
 549 //               or as large as the parse tree
 550 //               for an entire expression (this will be the one item left on the stack
 551 //               when the parsing of an RBBI rule completes.
 552 //
 553 //               This function is called when a binary operator is encountered.
 554 //               It looks back up the stack for operators that are not yet associated
 555 //               with a right operand, and if the precedence of the stacked operator >=
 556 //               the precedence of the current operator, binds the operand left,
 557 //               to the previously encountered operator.
 558 //
 559 //----------------------------------------------------------------------------------------
 560 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
 561     RBBINode *n;
 562     // printNodeStack("entering fixOpStack()");
 563     for (;;) {
 564         n = fNodeStack[fNodeStackPtr-1];   // an operator node
 565         if (n->fPrecedence == 0) {
 566             RBBIDebugPrintf("RBBIRuleScanner::fixOpStack, bad operator node\n");
 567             error(U_BRK_INTERNAL_ERROR);
 568             return;
 569         }
 570
 571         if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
 572             // The most recent operand goes with the current operator,
 573             //   not with the previously stacked one.
 574             break;
 575         }
 576             // Stack operator is a binary op  ( '|' or concatenation)
 577             //   TOS operand becomes right child of this operator.
 578             //   Resulting subexpression becomes the TOS operand.
 579             n->fRightChild = fNodeStack[fNodeStackPtr];
 580             fNodeStack[fNodeStackPtr]->fParent = n;
 581             fNodeStackPtr--;
 582         // printNodeStack("looping in fixOpStack()   ");
 583     }
 584
 585     if (p <= RBBINode::precLParen) {
 586         // Scan is at a right paren or end of expression.
 587         //  The scanned item must match the stack, or else there was an error.
 588         //  Discard the left paren (or start expr) node from the stack,
 589             //  leaving the completed (sub)expression as TOS.
 590             if (n->fPrecedence != p) {
 591                 // Right paren encountered matched start of expression node, or
 592                 // end of expression matched with a left paren node.
 593                 error(U_BRK_MISMATCHED_PAREN);
 594             }
 595             fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
 596             fNodeStackPtr--;
 597             // Delete the now-discarded LParen or Start node.
 598             delete n;
 599     }
 600     // printNodeStack("leaving fixOpStack()");
 601 }
 602
 603
 604
 605
 606 //----------------------------------------------------------------------------------------
 607 //
 608 //   findSetFor    given a UnicodeString,
 609 //                  - find the corresponding Unicode Set  (uset node)
 610 //                         (create one if necessary)
 611 //                  - Set fLeftChild of the caller's node (should be a setRef node)
 612 //                         to the uset node
 613 //                 Maintain a hash table of uset nodes, so the same one is always used
 614 //                    for the same string.
 615 //                 If a "to adopt" set is provided and we haven't seen this key before,
 616 //                    add the provided set to the hash table.
 617 //                 If the string is one (32 bit) char in length, the set contains
 618 //                    just one element which is the char in question.
 619 //                 If the string is "any", return a set containing all chars.
 620 //
 621 //----------------------------------------------------------------------------------------
 622 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
 623
 624     RBBISetTableEl   *el;
 625
 626     // First check whether we've already cached a set for this string.
 627     // If so, just use the cached set in the new node.
 628     //   delete any set provided by the caller, since we own it.
 629     el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
 630     if (el != NULL) {
 631         delete setToAdopt;
 632         node->fLeftChild = el->val;
 633         U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
 634         return;
 635     }
 636
 637     // Haven't seen this set before.
 638     // If the caller didn't provide us with a prebuilt set,
 639     //   create a new UnicodeSet now.
 640     if (setToAdopt == NULL) {
 641         if (s.compare(kAny, -1) == 0) {
 642             setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
 643         } else {
 644             UChar32 c;
 645             c = s.char32At(0);
 646             setToAdopt = new UnicodeSet(c, c);
 647         }
 648     }
 649
 650     //
 651     // Make a new uset node to refer to this UnicodeSet
 652     // This new uset node becomes the child of the caller's setReference node.
 653     //
 654     RBBINode *usetNode    = new RBBINode(RBBINode::uset);
 655     usetNode->fInputSet   = setToAdopt;
 656     usetNode->fParent     = node;
 657     node->fLeftChild      = usetNode;
 658     usetNode->fText = s;
 659
 660
 661     //
 662     // Add the new uset node to the list of all uset nodes.
 663     //
 664     fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
 665
 666
 667     //
 668     // Add the new set to the set hash table.
 669     //
 670     el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
 671     UnicodeString *tkey = new UnicodeString(s);
 672     if (tkey == NULL || el == NULL || setToAdopt == NULL) {
 673         error(U_MEMORY_ALLOCATION_ERROR);
 674         return;
 675     }
 676     el->key = tkey;
 677     el->val = usetNode;
 678     uhash_put(fSetTable, el->key, el, fRB->fStatus);
 679
 680     return;
 681 }
 682
 683
 684
 685 //
 686 //  Assorted Unicode character constants.
 687 //     Numeric because there is no portable way to enter them as literals.
 688 //     (Think EBCDIC).
 689 //
 690 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
 691 static const UChar      chLF        = 0x0a;
 692 static const UChar      chNEL       = 0x85;      //    NEL newline variant
 693 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
 694 static const UChar      chApos      = 0x27;      //  single quote, for quoted chars.
 695 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
 696 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
 697 static const UChar      chLParen    = 0x28;
 698 static const UChar      chRParen    = 0x29;
 699
 700
 701 //----------------------------------------------------------------------------------------
 702 //
 703 //  stripRules    Return a rules string without unnecessary
 704 //                characters.
 705 //
 706 //----------------------------------------------------------------------------------------
 707 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
 708     UnicodeString strippedRules;
 709     int rulesLength = rules.length();
 710     for (int idx = 0; idx < rulesLength; ) {
 711         UChar ch = rules[idx++];
 712         if (ch == chPound) {
 713             while (idx < rulesLength
 714                 && ch != chCR && ch != chLF && ch != chNEL)
 715             {
 716                 ch = rules[idx++];
 717             }
 718         }
 719         if (!u_isISOControl(ch)) {
 720             strippedRules.append(ch);
 721         }
 722     }
 723     // strippedRules = strippedRules.unescape();
 724     return strippedRules;
 725 }
 726
 727
 728 //----------------------------------------------------------------------------------------
 729 //
 730 //  nextCharLL    Low Level Next Char from rule input source.
 731 //                Get a char from the input character iterator,
 732 //                keep track of input position for error reporting.
 733 //
 734 //----------------------------------------------------------------------------------------
 735 UChar32  RBBIRuleScanner::nextCharLL() {
 736     UChar32  ch;
 737
 738     if (fNextIndex >= fRB->fRules.length()) {
 739         return (UChar32)-1;
 740     }
 741     ch         = fRB->fRules.char32At(fNextIndex);
 742     fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
 743
 744     if (ch == chCR ||
 745         ch == chNEL ||
 746         ch == chLS   ||
 747         ch == chLF && fLastChar != chCR) {
 748         // Character is starting a new line.  Bump up the line number, and
 749         //  reset the column to 0.
 750         fLineNum++;
 751         fCharNum=0;
 752         if (fQuoteMode) {
 753             error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
 754             fQuoteMode = FALSE;
 755         }
 756     }
 757     else {
 758         // Character is not starting a new line.  Except in the case of a
 759         //   LF following a CR, increment the column position.
 760         if (ch != chLF) {
 761             fCharNum++;
 762         }
 763     }
 764     fLastChar = ch;
 765     return ch;
 766 }
 767
 768
 769 //---------------------------------------------------------------------------------
 770 //
 771 //   nextChar     for rules scanning.  At this level, we handle stripping
 772 //                out comments and processing backslash character escapes.
 773 //                The rest of the rules grammar is handled at the next level up.
 774 //
 775 //---------------------------------------------------------------------------------
 776 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
 777
 778     // Unicode Character constants needed for the processing done by nextChar(),
 779     //   in hex because literals wont work on EBCDIC machines.
 780
 781     fScanIndex = fNextIndex;
 782     c.fChar    = nextCharLL();
 783     c.fEscaped = FALSE;
 784
 785     //
 786     //  check for '' sequence.
 787     //  These are recognized in all contexts, whether in quoted text or not.
 788     //
 789     if (c.fChar == chApos) {
 790         if (fRB->fRules.char32At(fNextIndex) == chApos) {
 791             c.fChar    = nextCharLL();        // get nextChar officially so character counts
 792             c.fEscaped = TRUE;                //   stay correct.
 793         }
 794         else
 795         {
 796             // Single quote, by itself.
 797             //   Toggle quoting mode.
 798             //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
 799             fQuoteMode = !fQuoteMode;
 800             if (fQuoteMode == TRUE) {
 801                 c.fChar = chLParen;
 802             } else {
 803                 c.fChar = chRParen;
 804             }
 805             c.fEscaped = FALSE;      // The paren that we return is not escaped.
 806             return;
 807         }
 808     }
 809
 810     if (fQuoteMode) {
 811         c.fEscaped = TRUE;
 812     }
 813     else
 814     {
 815         // We are not in a 'quoted region' of the source.
 816         //
 817         if (c.fChar == chPound) {
 818             // Start of a comment.  Consume the rest of it.
 819             //  The new-line char that terminates the comment is always returned.
 820             //  It will be treated as white-space, and serves to break up anything
 821             //    that might otherwise incorrectly clump together with a comment in
 822             //    the middle (a variable name, for example.)
 823             for (;;) {
 824                 c.fChar = nextCharLL();
 825                 if (c.fChar == (UChar32)-1 ||  // EOF
 826                     c.fChar == chCR     ||
 827                     c.fChar == chLF     ||
 828                     c.fChar == chNEL    ||
 829                     c.fChar == chLS)       {break;}
 830             }
 831         }
 832         if (c.fChar == (UChar32)-1) {
 833             return;
 834         }
 835
 836         //
 837         //  check for backslash escaped characters.
 838         //  Use UnicodeString::unescapeAt() to handle them.
 839         //
 840         if (c.fChar == chBackSlash) {
 841             c.fEscaped = TRUE;
 842             int32_t startX = fNextIndex;
 843             c.fChar = fRB->fRules.unescapeAt(fNextIndex);
 844             if (fNextIndex == startX) {
 845                 error(U_BRK_HEX_DIGITS_EXPECTED);
 846             }
 847             fCharNum += fNextIndex-startX;
 848         }
 849     }
 850     // putc(c.fChar, stdout);
 851 }
 852
 853 //---------------------------------------------------------------------------------
 854 //
 855 //  Parse RBBI rules.   The state machine for rules parsing is here.
 856 //                      The state tables are hand-written in the file TODO.txt,
 857 //                      and converted to the form used here by a perl
 858 //                      script rbbicst.pl
 859 //
 860 //---------------------------------------------------------------------------------
 861 void RBBIRuleScanner::parse() {
 862     uint16_t                state;
 863     const RBBIRuleTableEl  *tableEl;
 864
 865     if (U_FAILURE(*fRB->fStatus)) {
 866         return;
 867     }
 868
 869     state = 1;
 870     nextChar(fC);
 871     //
 872     // Main loop for the rule parsing state machine.
 873     //   Runs once per state transition.
 874     //   Each time through optionally performs, depending on the state table,
 875     //      - an advance to the the next input char
 876     //      - an action to be performed.
 877     //      - pushing or popping a state to/from the local state return stack.
 878     //
 879     for (;;) {
 880         //  Bail out if anything has gone wrong.
 881         //  RBBI rule file parsing stops on the first error encountered.
 882         if (U_FAILURE(*fRB->fStatus)) {
 883             break;
 884         }
 885
 886         // Quit if state == 0.  This is the normal way to exit the state machine.
 887         //
 888         if (state == 0) {
 889             break;
 890         }
 891
 892         // Find the state table element that matches the input char from the rule, or the
 893         //    class of the input character.  Start with the first table row for this
 894         //    state, then linearly scan forward until we find a row that matches the
 895         //    character.  The last row for each state always matches all characters, so
 896         //    the search will stop there, if not before.
 897         //
 898         tableEl = &gRuleParseStateTable[state];
 899         if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
 900             RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
 901                 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
 902         }
 903
 904         for (;;) {
 905             if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
 906             if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {
 907                 // Table row specified an individual character, not a set, and
 908                 //   the input character is not escaped, and
 909                 //   the input character matched it.
 910                 break;
 911             }
 912             if (tableEl->fCharClass == 255) {
 913                 // Table row specified default, match anything character class.
 914                 break;
 915             }
 916             if (tableEl->fCharClass == 254 && fC.fEscaped)  {
 917                 // Table row specified "escaped" and the char was escaped.
 918                 break;
 919             }
 920             if (tableEl->fCharClass == 253 && fC.fEscaped &&
 921                 (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
 922                 // Table row specified "escaped P" and the char is either 'p' or 'P'.
 923                 break;
 924             }
 925             if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
 926                 // Table row specified eof and we hit eof on the input.
 927                 break;
 928             }
 929
 930             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
 931                 fC.fEscaped == FALSE &&                                      //   char is not escaped &&
 932                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
 933                 UnicodeSet *uniset = fRuleSets[tableEl->fCharClass-128];
 934                 if (uniset->contains(fC.fChar)) {
 935                     // Table row specified a character class, or set of characters,
 936                     //   and the current char matches it.
 937                     break;
 938                 }
 939             }
 940
 941             // No match on this row, advance to the next  row for this state,
 942             tableEl++;
 943         }
 944         if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("\n");}
 945
 946         //
 947         // We've found the row of the state table that matches the current input
 948         //   character from the rules string.
 949         // Perform any action specified  by this row in the state table.
 950         if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
 951             // Break out of the state machine loop if the
 952             //   the action signalled some kind of error, or
 953             //   the action was to exit, occurs on normal end-of-rules-input.
 954             break;
 955         }
 956
 957         if (tableEl->fPushState != 0) {
 958             fStackPtr++;
 959             if (fStackPtr >= kStackSize) {
 960                 error(U_BRK_INTERNAL_ERROR);
 961                 RBBIDebugPrintf("RBBIRuleScanner::parse() - state stack overflow.\n");
 962                 fStackPtr--;
 963             }
 964             fStack[fStackPtr] = tableEl->fPushState;
 965         }
 966
 967         if (tableEl->fNextChar) {
 968             nextChar(fC);
 969         }
 970
 971         // Get the next state from the table entry, or from the
 972         //   state stack if the next state was specified as "pop".
 973         if (tableEl->fNextState != 255) {
 974             state = tableEl->fNextState;
 975         } else {
 976             state = fStack[fStackPtr];
 977             fStackPtr--;
 978             if (fStackPtr < 0) {
 979                 error(U_BRK_INTERNAL_ERROR);
 980                 RBBIDebugPrintf("RBBIRuleScanner::parse() - state stack underflow.\n");
 981                 fStackPtr++;
 982             }
 983         }
 984
 985     }
 986
 987     //
 988     // If there were NO user specified reverse rules, set up the equivalent of ".*;"
 989     //
 990     if (fRB->fReverseTree == NULL) {
 991         fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
 992         RBBINode  *operand = pushNewNode(RBBINode::setRef);
 993         findSetFor(kAny, operand);
 994         fRB->fReverseTree->fLeftChild = operand;
 995         operand->fParent              = fRB->fReverseTree;
 996         fNodeStackPtr -= 2;
 997     }
 998
 999
1000     //
1001     // Parsing of the input RBBI rules is complete.
1002     // We now have a parse tree for the rule expressions
1003     // and a list of all UnicodeSets that are referenced.
1004     //
1005     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->print();}
1006     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
1007     {
1008         RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1009         fRB->fForwardTree->printTree();
1010         RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1011         fRB->fReverseTree->printTree();
1012     }
1013
1014 }
1015
1016
1017 //---------------------------------------------------------------------------------
1018 //
1019 //  printNodeStack     for debugging...
1020 //
1021 //---------------------------------------------------------------------------------
1022 void RBBIRuleScanner::printNodeStack(const char *title) {
1023     int i;
1024     RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
1025     for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree();}
1026 }
1027
1028
1029
1030
1031 //---------------------------------------------------------------------------------
1032 //
1033 //  pushNewNode   create a new RBBINode of the specified type and push it
1034 //                onto the stack of nodes.
1035 //
1036 //---------------------------------------------------------------------------------
1037 RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
1038     fNodeStackPtr++;
1039     if (fNodeStackPtr >= kStackSize) {
1040         error(U_BRK_INTERNAL_ERROR);
1041         RBBIDebugPrintf("RBBIRuleScanner::pushNewNode - stack overflow.\n");
1042         *fRB->fStatus = U_BRK_INTERNAL_ERROR;
1043         return NULL;
1044     }
1045     fNodeStack[fNodeStackPtr] = new RBBINode(t);
1046     if (fNodeStack[fNodeStackPtr] == NULL) {
1047         *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1048     }
1049     return fNodeStack[fNodeStackPtr];
1050 }
1051
1052
1053
1054 //---------------------------------------------------------------------------------
1055 //
1056 //  scanSet    Construct a UnicodeSet from the text at the current scan
1057 //             position.  Advance the scan position to the first character
1058 //             after the set.
1059 //
1060 //             A new RBBI setref node referring to the set is pushed onto the node
1061 //             stack.
1062 //
1063 //             The scan position is normally under the control of the state machine
1064 //             that controls rule parsing.  UnicodeSets, however, are parsed by
1065 //             the UnicodeSet constructor, not by the RBBI rule parser.
1066 //
1067 //---------------------------------------------------------------------------------
1068 void RBBIRuleScanner::scanSet() {
1069     UnicodeSet    *uset;
1070     ParsePosition  pos;
1071     int            startPos;
1072     int            i;
1073
1074     if (U_FAILURE(*fRB->fStatus)) {
1075         return;
1076     }
1077
1078     pos.setIndex(fScanIndex);
1079     startPos = fScanIndex;
1080     UErrorCode localStatus = U_ZERO_ERROR;
1081     uset = new UnicodeSet(fRB->fRules, pos,
1082                          *fSymbolTable,
1083                          localStatus);
1084     if (U_FAILURE(localStatus)) {
1085         //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
1086         //         UnicodeSet appears to not be reporting correctly at this time.
1087         RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
1088         error(localStatus);
1089         delete uset;
1090         return;
1091     }
1092
1093     // Verify that the set contains at least one code point.
1094     //
1095     if (uset->charAt(0) == -1) {
1096         // This set is empty.
1097         //  Make it an error, because it almost certainly is not what the user wanted.
1098         //  Also, avoids having to think about corner cases in the tree manipulation code
1099         //   that occurs later on.
1100         error(U_BRK_RULE_EMPTY_SET);
1101         delete uset;
1102         return;
1103     }
1104
1105
1106     // Advance the RBBI parse postion over the UnicodeSet pattern.
1107     //   Don't just set fScanIndex because the line/char positions maintained
1108     //   for error reporting would be thrown off.
1109     i = pos.getIndex();
1110     for (;;) {
1111         if (fNextIndex >= i) {
1112             break;
1113         }
1114         nextCharLL();
1115     }
1116
1117     if (U_SUCCESS(*fRB->fStatus)) {
1118         RBBINode         *n;
1119
1120         n = pushNewNode(RBBINode::setRef);
1121         n->fFirstPos = startPos;
1122         n->fLastPos  = fNextIndex;
1123         fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1124         //  findSetFor() serves several purposes here:
1125         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1126         //     - Mantains collection of all sets in use, needed later for establishing
1127         //          character categories for run time engine.
1128         //     - Eliminates mulitiple instances of the same set.
1129         //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1130         findSetFor(n->fText, n, uset);
1131     }
1132
1133 }
1134
1135 U_NAMESPACE_END
1136
1137 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */