//
// file: regexcmp.cpp
//
-// Copyright (C) 2002-2004 International Business Machines Corporation and others.
+// Copyright (C) 2002-2006 International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the ICU regular expression compiler, which is responsible
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// Constructor.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
{
fStatus = &status;
fCharNum = 0;
fQuoteMode = FALSE;
fInBackslashQuote = FALSE;
- fModeFlags = fRXPat->fFlags;
+ fModeFlags = fRXPat->fFlags | 0x80000000;
fEOLComments = TRUE;
fMatchOpenParen = -1;
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// Destructor
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
RegexCompile::~RegexCompile() {
    // Empty body: no explicit cleanup is performed here.
}
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// Compile regex pattern. The state machine for regexp pattern parsing is here.
// The state tables are hand-written in the file regexcst.txt,
// and converted to the form used here by a perl
// script regexcst.pl
//
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::compile(
const UnicodeString &pat, // Source pat to be compiled.
UParseError &pp, // Error position info
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// doParseAction Do some action during regex pattern parsing.
// Called by the parse state machine.
// in functions called from the parse actions defined here.
//
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
UBool RegexCompile::doParseActions(EParseAction action)
{
UBool returnVal = TRUE;
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
- fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
- fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
- fParenStack.push(-1, *fStatus); // Begin a Paren Stack Frame
- fParenStack.push( 3, *fStatus); // Push location of first NOP
+ // Standard open nonCapture paren action emits the two NOPs and
+ // sets up the paren stack frame.
+ doParseActions((EParseAction)doOpenNonCaptureParen);
break;
case doPatFinish:
case doSetMatchMode:
// We've got a (?i) or similar. The match mode is being changed, but
// the change is not scoped to a parenthesized block.
+ U_ASSERT(fNewModeFlags < 0);
fModeFlags = fNewModeFlags;
// Prevent any string from spanning across the change of match mode.
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
// Set the current mode flags to the new values.
+ U_ASSERT(fNewModeFlags < 0);
fModeFlags = fNewModeFlags;
}
break;
}
return returnVal;
-};
+}
// the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
for (loc=0; loc<fParenStack.size(); loc++) {
int32_t x = fParenStack.elementAti(loc);
+ U_ASSERT(x < code->size());
if (x>where) {
x++;
fParenStack.setElementAt(x, loc);
// This function is called both when encountering a
// real ) and at the end of the pattern.
//
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::handleCloseParen() {
int32_t patIdx;
int32_t patOp;
// the value they had at the open paren. Saved value is
// at the top of the paren stack.
fModeFlags = fParenStack.popi();
+ U_ASSERT(fModeFlags < 0);
// Do any additional fixups, depending on the specific kind of
// parenthesized grouping this is
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// compileSet Compile the pattern operations for a reference to a
// UnicodeSet.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::compileSet(UnicodeSet *theSet)
{
if (theSet == NULL) {
}
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// compileInterval Generate the code for a {min, max} style interval quantifier.
// Except for the specific opcodes used, the code is the same
// 5 CTR_LOOP
//
// In
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
{
// The CTR_INIT op at the top of the block with the {n,m} quantifier takes
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// matchStartType Determine how a match can start.
// Used to optimize find() operations.
// op where the min match coming in is zero, add that op's possible
// starting matches to the possible starts for the overall pattern.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::matchStartType() {
if (U_FAILURE(*fStatus)) {
return;
// character may have distinct cased forms. Add all of them
// to the set of possible starting match chars.
UnicodeSet s(c, c);
- s.closeOver(USET_CASE);
+ s.closeOver(USET_CASE_INSENSITIVE);
fRXPat->fInitialChars->addAll(s);
} else {
// Char has no case variants. Just add it as-is to the
int32_t stringStartIdx = URX_VAL(op);
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
UnicodeSet s(c, c);
- s.closeOver(USET_CASE);
+ s.closeOver(USET_CASE_INSENSITIVE);
fRXPat->fInitialChars->addAll(s);
numInitialStrings += 2; // Matching on an initial string not possible.
}
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// minMatchLength Calculate the length of the shortest string that could
// match the specified pattern.
// start and end are the range of p-code operations to be
// examined. The endpoints are included in the range.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
if (U_FAILURE(*fStatus)) {
return 0;
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// maxMatchLength Calculate the length of the longest string that could
// match the specified pattern.
// value may be longer than the actual maximum; it must
// never be shorter.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
if (U_FAILURE(*fStatus)) {
return 0;
// End of look-ahead ops should always be consumed by the processing at
// the URX_LA_START op.
- U_ASSERT(FALSE);
- break;
+ // U_ASSERT(FALSE);
+ // break;
case URX_LB_START:
{
}
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// stripNOPs Remove any NOP operations from the compiled pattern code.
// Extra NOPs are inserted for some constructs during the initial
// code generation to provide locations that may be patched later.
// Many end up unneeded, and are removed by this function.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::stripNOPs() {
if (U_FAILURE(*fStatus)) {
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// OptDotStar Optimize patterns that end with a '.*' or '.+' to
// just advance the input to the end.
// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
// END
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::OptDotStar() {
// Scan backwards in the pattern, looking for a JMP_SAV near the end.
int32_t jmpLoc;
}
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// Error Report a rule parse error.
// Only report it if no previous error has been recorded.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::error(UErrorCode e) {
if (U_SUCCESS(*fStatus)) {
*fStatus = e;
static const UChar chLF = 0x0a; // line feed
static const UChar chNEL = 0x85; // NEL newline variant
static const UChar chLS = 0x2028; // Unicode Line Separator
-static const UChar chApos = 0x27; // single quote, for quoted chars.
static const UChar chPound = 0x23; // '#', introduces a comment.
static const UChar chE = 0x45; // 'E'
+static const UChar chUpperN = 0x4E; // 'N'
+static const UChar chLowerP = 0x70; // 'p'
+static const UChar chUpperP = 0x50; // 'P'
static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
-static const UChar chLParen = 0x28;
-static const UChar chRParen = 0x29;
static const UChar chLBracket = 0x5b; // '['
static const UChar chRBracket = 0x5d; // ']'
static const UChar chRBrace = 0x7d; // '}'
-static const UChar chUpperN = 0x4E;
-static const UChar chLowerP = 0x70;
-static const UChar chUpperP = 0x50;
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// nextCharLL Low Level Next Char from the regex pattern.
// Get a char from the string, keep track of input position
// for error reporting.
//
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
UChar32 RegexCompile::nextCharLL() {
UChar32 ch;
UnicodeString &pattern = fRXPat->fPattern;
return ch;
}
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// peekCharLL Low Level Character Scanning, sneak a peek at the next
// character without actually getting it.
//
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
UChar32 RegexCompile::peekCharLL() {
if (fPeekChar == -1) {
fPeekChar = nextCharLL();
}
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// nextChar for pattern scanning. At this level, we handle stripping
// out comments and processing some backslash character escapes.
// The rest of the pattern grammar is handled at the next level up.
//
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {
fScanIndex = fNextIndex;
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// scanSet Construct a UnicodeSet from the text at the current scan
// position. Advance the scan position to the first character
// that controls pattern parsing. UnicodeSets, however, are parsed by
// the UnicodeSet constructor, not by the Regex pattern parser.
//
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanSet() {
UnicodeSet *uset = NULL;
ParsePosition pos;
- int startPos;
int i;
if (U_FAILURE(*fStatus)) {
}
pos.setIndex(fScanIndex);
- startPos = fScanIndex;
UErrorCode localStatus = U_ZERO_ERROR;
uint32_t usetFlags = 0;
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
}
return uset;
-};
+}
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
//
// scanProp Construct a UnicodeSet from the text at the current scan
// position, which will be of the form \p{whatever}
// Return a UnicodeSet, constructed from the \P pattern,
// or NULL if the pattern is invalid.
//
-//---------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanProp() {
UnicodeSet *uset = NULL;
nextChar(fC); // Continue overall regex pattern processing with char after the '}'
return uset;
-};
+}
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
+