5 // Copyright (C) 2002-2003 International Business Machines Corporation and others.
6 // All Rights Reserved.
8 // This file contains the ICU regular expression compiler, which is responsible
9 // for processing a regular expression pattern into the compiled form that
10 // is used by the match finding engine.
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
33 #include "regexcst.h" // Contains state table for the regex pattern parser.
34 // generated by a Perl script.
46 //----------------------------------------------------------------------------------------
50 //----------------------------------------------------------------------------------------
51 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) : fParenStack(status
)
62 fInBackslashQuote
= FALSE
;
63 fModeFlags
= fRXPat
->fFlags
;
67 fMatchCloseParen
= -1;
70 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
71 status
= rxp
->fDeferredStatus
;
77 //----------------------------------------------------------------------------------------
81 //----------------------------------------------------------------------------------------
82 RegexCompile::~RegexCompile() {
87 //----------------------------------------------------------------------------------------
89 // cleanup. Called (indirectly) by u_cleanup to free all cached memory
91 //----------------------------------------------------------------------------------------
92 void RegexCompile::cleanup() {
93 delete RegexStaticSets::gStaticSets
;
94 RegexStaticSets::gStaticSets
= NULL
;
98 //---------------------------------------------------------------------------------
100 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
101 // The state tables are hand-written in the file regexcst.txt,
102 // and converted to the form used here by a perl
103 // script regexcst.pl
105 //---------------------------------------------------------------------------------
106 void RegexCompile::compile(
107 const UnicodeString
&pat
, // Source pat to be compiled.
108 UParseError
&pp
, // Error position info
109 UErrorCode
&e
) // Error Code
114 fStack
[fStackPtr
] = 0;
116 if (U_FAILURE(*fStatus
)) {
120 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
121 U_ASSERT(fRXPat
->fPattern
.length() == 0);
123 // Prepare the RegexPattern object to receive the compiled pattern.
124 // TODO: remove per-instance field, and just use globals directly. (But check perf)
125 fRXPat
->fPattern
= pat
;
126 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
127 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
130 // Initialize the pattern scanning state machine
131 fPatternLength
= pat
.length();
133 const RegexTableEl
*tableEl
;
134 nextChar(fC
); // Fetch the first char from the pattern string.
137 // Main loop for the regex pattern parsing state machine.
138 // Runs once per state transition.
139 // Each time through optionally performs, depending on the state table,
140 // - an advance to the the next pattern char
141 // - an action to be performed.
142 // - pushing or popping a state to/from the local state return stack.
143 // file regexcst.txt is the source for the state table. The logic behind
144 // recongizing the pattern syntax is there, not here.
147 // Bail out if anything has gone wrong.
148 // Regex pattern parsing stops on the first error encountered.
149 if (U_FAILURE(*fStatus
)) {
153 U_ASSERT(state
!= 0);
155 // Find the state table element that matches the input char from the pattern, or the
156 // class of the input character. Start with the first table row for this
157 // state, then linearly scan forward until we find a row that matches the
158 // character. The last row for each state always matches all characters, so
159 // the search will stop there, if not before.
161 tableEl
= &gRuleParseStateTable
[state
];
162 REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ",
163 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]);
165 for (;;) { // loop through table rows belonging to this state, looking for one
166 // that matches the current input char.
167 REGEX_SCAN_DEBUG_PRINTF( ".");
168 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
169 // Table row specified an individual character, not a set, and
170 // the input character is not quoted, and
171 // the input character matched it.
174 if (tableEl
->fCharClass
== 255) {
175 // Table row specified default, match anything character class.
178 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
179 // Table row specified "quoted" and the char was quoted.
182 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
183 // Table row specified eof and we hit eof on the input.
187 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
188 fC
.fQuoted
== FALSE
&& // char is not escaped &&
189 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
190 UnicodeSet
*uniset
= RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128];
191 if (uniset
->contains(fC
.fChar
)) {
192 // Table row specified a character class, or set of characters,
193 // and the current char matches it.
198 // No match on this row, advance to the next row for this state,
201 REGEX_SCAN_DEBUG_PRINTF("\n");
204 // We've found the row of the state table that matches the current input
205 // character from the rules string.
206 // Perform any action specified by this row in the state table.
207 if (doParseActions((EParseAction
)tableEl
->fAction
) == FALSE
) {
208 // Break out of the state machine loop if the
209 // the action signalled some kind of error, or
210 // the action was to exit, occurs on normal end-of-rules-input.
214 if (tableEl
->fPushState
!= 0) {
216 if (fStackPtr
>= kStackSize
) {
217 error(U_REGEX_INTERNAL_ERROR
);
218 REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
221 fStack
[fStackPtr
] = tableEl
->fPushState
;
225 // NextChar. This is where characters are actually fetched from the pattern.
226 // Happens under control of the 'n' tag in the state table.
228 if (tableEl
->fNextChar
) {
232 // Get the next state from the table entry, or from the
233 // state stack if the next state was specified as "pop".
234 if (tableEl
->fNextState
!= 255) {
235 state
= tableEl
->fNextState
;
237 state
= fStack
[fStackPtr
];
240 // state stack underflow
241 // This will occur if the user pattern has mis-matched parentheses,
242 // with extra close parens.
245 error(U_REGEX_MISMATCHED_PAREN
);
252 // The pattern has now been read and processed, and the compiled code generated.
255 // Back-reference fixup
258 for (loc
=0; loc
<fRXPat
->fCompiledPat
->size(); loc
++) {
259 int32_t op
= fRXPat
->fCompiledPat
->elementAti(loc
);
260 int32_t opType
= URX_TYPE(op
);
261 if (opType
== URX_BACKREF
|| opType
== URX_BACKREF_I
) {
262 int32_t where
= URX_VAL(op
);
263 if (where
> fRXPat
->fGroupMap
->size()) {
264 error(U_REGEX_INVALID_BACK_REF
);
267 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
268 op
= URX_BUILD(opType
, where
);
269 fRXPat
->fCompiledPat
->setElementAt(op
, loc
);
275 // Compute the number of digits requried for the largest capture group number.
277 fRXPat
->fMaxCaptureDigits
= 1;
280 if (n
> fRXPat
->fGroupMap
->size()) {
283 fRXPat
->fMaxCaptureDigits
++;
288 // The pattern's fFrameSize so far has accumulated the requirements for
289 // storage for capture parentheses, counters, etc. that are encountered
290 // in the pattern. Add space for the two variables that are always
291 // present in the saved state: the input string position and the
292 // position in the compiled pattern.
294 fRXPat
->fFrameSize
+=2;
297 // Get bounds for the minimum and maximum length of a string that this
298 // pattern can match. Used to avoid looking for matches in strings that
301 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
304 // Optimization passes
311 // Set up fast latin-1 range sets
313 int32_t numSets
= fRXPat
->fSets
->size();
314 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
316 for (i
=0; i
<numSets
; i
++) {
317 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
318 fRXPat
->fSets8
[i
].init(s
);
322 // A stupid bit of non-sense to prevent code coverage testing from complaining
323 // about the pattern.dump() debug function. Go through the motions of dumping,
324 // even though, without the #define set, it will do nothing.
326 #ifndef REGEX_DUMP_DEBUG
327 static UBool phonyDumpDone
= FALSE
;
328 if (phonyDumpDone
==FALSE
) {
330 phonyDumpDone
= TRUE
;
340 //----------------------------------------------------------------------------------------
342 // doParseAction Do some action during regex pattern parsing.
343 // Called by the parse state machine.
345 // Generation of the match engine PCode happens here, or
346 // in functions called from the parse actions defined here.
349 //----------------------------------------------------------------------------------------
350 UBool
RegexCompile::doParseActions(EParseAction action
)
352 UBool returnVal
= TRUE
;
354 switch ((Regex_PatternParseAction
)action
) {
357 // Start of pattern compiles to:
358 //0 SAVE 2 Fall back to position of FAIL
360 //2 FAIL Stop if we ever reach here.
361 //3 NOP Dummy, so start of pattern looks the same as
362 // the start of an ( grouping.
363 //4 NOP Resreved, will be replaced by a save if there are
364 // OR | operators at the top level
365 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_STATE_SAVE
, 2), *fStatus
);
366 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_JMP
, 3), *fStatus
);
367 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_FAIL
, 0), *fStatus
);
368 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
369 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
371 fParenStack
.push(-1, *fStatus
); // Begin a Paren Stack Frame
372 fParenStack
.push( 3, *fStatus
); // Push location of first NOP
376 // We've scanned to the end of the pattern
377 // The end of pattern compiles to:
379 // which will stop the runtime match engine.
380 // Encountering end of pattern also behaves like a close paren,
381 // and forces fixups of the State Save at the beginning of the compiled pattern
382 // and of any OR operations at the top level.
385 if (fParenStack
.size() > 0) {
386 // Missing close paren in pattern.
387 error(U_REGEX_MISMATCHED_PAREN
);
390 // add the END operation to the compiled pattern.
391 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_END
, 0), *fStatus
);
393 // Terminate the pattern compilation state machine.
400 // Scanning a '|', as in (A|B)
402 // Insert a SAVE operation at the start of the pattern section preceding
403 // this OR at this level. This SAVE will branch the match forward
404 // to the right hand side of the OR in the event that the left hand
405 // side fails to match and backtracks. Locate the position for the
406 // save from the location on the top of the parentheses stack.
407 int32_t savePosition
= fParenStack
.popi();
408 int32_t op
= fRXPat
->fCompiledPat
->elementAti(savePosition
);
409 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
410 op
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
411 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
413 // Append an JMP operation into the compiled pattern. The operand for
414 // the JMP will eventually be the location following the ')' for the
415 // group. This will be patched in later, when the ')' is encountered.
416 op
= URX_BUILD(URX_JMP
, 0);
417 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
419 // Push the position of the newly added JMP op onto the parentheses stack.
420 // This registers if for fixup when this block's close paren is encountered.
421 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
423 // Append a NOP to the compiled pattern. This is the slot reserved
424 // for a SAVE in the event that there is yet another '|' following
426 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
427 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
432 case doOpenCaptureParen
:
435 // - NOP, which later may be replaced by a save-state if the
436 // parenthesized group gets a * quantifier, followed by
437 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
438 // - NOP, which may later be replaced by a save-state if there
439 // is an '|' alternation within the parens.
441 // Each capture group gets three slots in the save stack frame:
442 // 0: Capture Group start position (in input string being matched.)
443 // 1: Capture Group end positino.
444 // 2: Start of Match-in-progress.
445 // The first two locations are for a completed capture group, and are
446 // referred to by back references and the like.
447 // The third location stores the capture start position when an START_CAPTURE is
448 // encountered. This will be promoted to a completed capture when (and if) the corresponding
449 // END_CAPure is encountered.
451 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
452 int32_t varsLoc
= fRXPat
->fFrameSize
; // Reserve three slots in match stack frame.
453 fRXPat
->fFrameSize
+= 3;
454 int32_t cop
= URX_BUILD(URX_START_CAPTURE
, varsLoc
);
455 fRXPat
->fCompiledPat
->addElement(cop
, *fStatus
);
456 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
458 // On the Parentheses stack, start a new frame and add the postions
459 // of the two NOPs. Depending on what follows in the pattern, the
460 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
461 // address of the end of the parenthesized group.
462 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
463 fParenStack
.push(capturing
, *fStatus
); // Frame type.
464 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
465 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
467 // Save the mapping from group number to stack frame variable position.
468 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
472 case doOpenNonCaptureParen
:
473 // Open non-caputuring (grouping only) Paren.
475 // - NOP, which later may be replaced by a save-state if the
476 // parenthesized group gets a * quantifier, followed by
477 // - NOP, which may later be replaced by a save-state if there
478 // is an '|' alternation within the parens.
480 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
481 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
483 // On the Parentheses stack, start a new frame and add the postions
485 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
486 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
487 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
488 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
493 case doOpenAtomicParen
:
494 // Open Atomic Paren. (?>
496 // - NOP, which later may be replaced if the parenthesized group
497 // has a quantifier, followed by
498 // - STO_SP save state stack position, so it can be restored at the ")"
499 // - NOP, which may later be replaced by a save-state if there
500 // is an '|' alternation within the parens.
502 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
503 int32_t varLoc
= fRXPat
->fDataSize
; // Reserve a data location for saving the
504 fRXPat
->fDataSize
+= 1; // state stack ptr.
505 int32_t stoOp
= URX_BUILD(URX_STO_SP
, varLoc
);
506 fRXPat
->fCompiledPat
->addElement(stoOp
, *fStatus
);
507 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
509 // On the Parentheses stack, start a new frame and add the postions
510 // of the two NOPs. Depending on what follows in the pattern, the
511 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
512 // address of the end of the parenthesized group.
513 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
514 fParenStack
.push(atomic
, *fStatus
); // Frame type.
515 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
516 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
521 case doOpenLookAhead
:
522 // Positive Look-ahead (?= stuff )
524 // 1 START_LA dataLoc
525 // 2. NOP reserved for use by quantifiers on the block.
526 // Look-ahead can't have quantifiers, but paren stack
527 // compile time conventions require the slot anyhow.
528 // 3. NOP may be replaced if there is are '|' ops in the block.
529 // 4. code for parenthesized stuff.
532 // Two data slots are reserved, for saving the stack ptr and the input position.
534 int32_t dataLoc
= fRXPat
->fDataSize
;
535 fRXPat
->fDataSize
+= 2;
536 int32_t op
= URX_BUILD(URX_LA_START
, dataLoc
);
537 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
539 op
= URX_BUILD(URX_NOP
, 0);
540 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
541 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
543 // On the Parentheses stack, start a new frame and add the postions
545 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
546 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
547 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
548 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
552 case doOpenLookAheadNeg
:
553 // Negated Lookahead. (?! stuff )
555 // 1. START_LA dataloc
556 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
557 // // which continues with the match.
558 // 3. NOP // Std. Open Paren sequence, for possible '|'
559 // 4. code for parenthesized stuff.
560 // 5. END_LA // Cut back stack, remove saved state from step 2.
561 // 6. FAIL // code in block succeeded, so neg. lookahead fails.
564 int32_t dataLoc
= fRXPat
->fDataSize
;
565 fRXPat
->fDataSize
+= 2;
566 int32_t op
= URX_BUILD(URX_LA_START
, dataLoc
);
567 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
569 op
= URX_BUILD(URX_STATE_SAVE
, 0); // dest address will be patched later.
570 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
572 op
= URX_BUILD(URX_NOP
, 0);
573 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
575 // On the Parentheses stack, start a new frame and add the postions
576 // of the StateSave and NOP.
577 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
578 fParenStack
.push( negLookAhead
, *fStatus
); // Frame type
579 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
580 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
582 // Instructions #5 and #6 will be added when the ')' is encountered.
586 case doOpenLookBehind
:
588 // Compile a (?<= look-behind open paren.
591 // 0 URX_LB_START dataLoc
592 // 1 URX_LB_CONT dataLoc
595 // 4 URX_NOP Standard '(' boilerplate.
596 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
597 // 6 <code for LookBehind expression>
598 // 7 URX_LB_END dataLoc # Check match len, restore input len
599 // 8 URX_LA_END dataLoc # Restore stack, input pos
601 // Allocate a block of matcher data, to contain (when running a match)
602 // 0: Stack ptr on entry
603 // 1: Input Index on entry
604 // 2: Start index of match current match attempt.
605 // 3: Original Input String len.
607 // Allocate data space
608 int32_t dataLoc
= fRXPat
->fDataSize
;
609 fRXPat
->fDataSize
+= 4;
612 int32_t op
= URX_BUILD(URX_LB_START
, dataLoc
);
613 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
616 op
= URX_BUILD(URX_LB_CONT
, dataLoc
);
617 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
618 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MinMatchLength. To be filled later.
619 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MaxMatchLength. To be filled later.
622 op
= URX_BUILD(URX_NOP
, 0);
623 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
624 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
626 // On the Parentheses stack, start a new frame and add the postions
627 // of the URX_LB_CONT and the NOP.
628 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
629 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
630 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
631 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
633 // The final two instructions will be added when the ')' is encountered.
638 case doOpenLookBehindNeg
:
640 // Compile a (?<! negated look-behind open paren.
643 // 0 URX_LB_START dataLoc # Save entry stack, input len
644 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
648 // 5 URX_NOP Standard '(' boilerplate.
649 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
650 // 7 <code for LookBehind expression>
651 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
654 // Allocate a block of matcher data, to contain (when running a match)
655 // 0: Stack ptr on entry
656 // 1: Input Index on entry
657 // 2: Start index of match current match attempt.
658 // 3: Original Input String len.
660 // Allocate data space
661 int32_t dataLoc
= fRXPat
->fDataSize
;
662 fRXPat
->fDataSize
+= 4;
665 int32_t op
= URX_BUILD(URX_LB_START
, dataLoc
);
666 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
669 op
= URX_BUILD(URX_LBN_CONT
, dataLoc
);
670 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
671 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MinMatchLength. To be filled later.
672 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MaxMatchLength. To be filled later.
673 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // Continue Loc. To be filled later.
676 op
= URX_BUILD(URX_NOP
, 0);
677 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
678 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
680 // On the Parentheses stack, start a new frame and add the postions
681 // of the URX_LB_CONT and the NOP.
682 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
683 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
684 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
685 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
687 // The final two instructions will be added when the ')' is encountered.
691 case doConditionalExpr
:
692 // Conditionals such as (?(1)a:b)
694 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
695 error(U_REGEX_UNIMPLEMENTED
);
701 if (fParenStack
.size() <= 0) {
702 // Extra close paren, or missing open paren.
703 error(U_REGEX_MISMATCHED_PAREN
);
711 case doBadOpenParenType
:
713 error(U_REGEX_RULE_SYNTAX
);
717 case doMismatchedParenErr
:
718 error(U_REGEX_MISMATCHED_PAREN
);
722 // Normal '+' compiles to
723 // 1. stuff to be repeated (already built)
727 // Or, if the item to be repeated can match a zero length string,
728 // 1. STO_INP_LOC data-loc
729 // 2. body of stuff to be repeated
734 // Or, if the item to be repeated is simple
735 // 1. Item to be repeated.
736 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
737 // 3. LOOP_C stack location
739 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
742 // Check for simple constructs, which may get special optimized code.
743 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
744 int32_t repeatedOp
= fRXPat
->fCompiledPat
->elementAti(topLoc
);
746 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
747 // Emit optimized code for [char set]+
748 int32_t loopOpI
= URX_BUILD(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
749 fRXPat
->fCompiledPat
->addElement(loopOpI
, *fStatus
);
750 frameLoc
= fRXPat
->fFrameSize
;
751 fRXPat
->fFrameSize
++;
752 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, frameLoc
);
753 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
757 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
758 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
759 // Emit Optimized code for .+ operations.
760 int32_t loopOpI
= URX_BUILD(URX_LOOP_DOT_I
, 0);
761 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
762 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
765 fRXPat
->fCompiledPat
->addElement(loopOpI
, *fStatus
);
766 frameLoc
= fRXPat
->fFrameSize
;
767 fRXPat
->fFrameSize
++;
768 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, frameLoc
);
769 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
777 // Check for minimum match length of zero, which requires
778 // extra loop-breaking code.
779 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
780 // Zero length match is possible.
781 // Emit the code sequence that can handle it.
783 frameLoc
= fRXPat
->fFrameSize
;
784 fRXPat
->fFrameSize
++;
786 int32_t op
= URX_BUILD(URX_STO_INP_LOC
, frameLoc
);
787 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
789 op
= URX_BUILD(URX_JMP_SAV_X
, topLoc
+1);
790 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
792 // Simpler code when the repeated body must match something non-empty
793 int32_t jmpOp
= URX_BUILD(URX_JMP_SAV
, topLoc
);
794 fRXPat
->fCompiledPat
->addElement(jmpOp
, *fStatus
);
800 // Non-greedy '+?' compiles to
801 // 1. stuff to be repeated (already built)
805 int32_t topLoc
= blockTopLoc(FALSE
);
806 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, topLoc
);
807 fRXPat
->fCompiledPat
->addElement(saveStateOp
, *fStatus
);
813 // Normal (greedy) ? quantifier.
816 // 2. body of optional block
818 // Insert the state save into the compiled pattern, and we're done.
820 int32_t saveStateLoc
= blockTopLoc(TRUE
);
821 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
822 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
827 // Non-greedy ?? quantifier
830 // 2. body of optional block
834 // This code is less than ideal, with two jmps instead of one, because we can only
835 // insert one instruction at the top of the block being iterated.
837 int32_t jmp1_loc
= blockTopLoc(TRUE
);
838 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
840 int32_t jmp1_op
= URX_BUILD(URX_JMP
, jmp2_loc
+1);
841 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
843 int32_t jmp2_op
= URX_BUILD(URX_JMP
, jmp2_loc
+2);
844 fRXPat
->fCompiledPat
->addElement(jmp2_op
, *fStatus
);
846 int32_t save_op
= URX_BUILD(URX_STATE_SAVE
, jmp1_loc
+1);
847 fRXPat
->fCompiledPat
->addElement(save_op
, *fStatus
);
853 // Normal (greedy) * quantifier.
856 // 2. body of stuff being iterated over
860 // Or, if the body is a simple [Set],
861 // 1. LOOP_SR_I set number
862 // 2. LOOP_C stack location
865 // Or if this is a .*
866 // 1. LOOP_DOT_I (. matches all mode flag)
867 // 2. LOOP_C stack location
869 // Or, if the body can match a zero-length string, to inhibit infinite loops,
871 // 2. STO_INP_LOC data-loc
876 // location of item #1, the STATE_SAVE
877 int32_t topLoc
= blockTopLoc(FALSE
);
878 int32_t dataLoc
= -1;
880 // Check for simple *, where the construct being repeated
881 // compiled to single opcode, and might be optimizable.
882 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
883 int32_t repeatedOp
= fRXPat
->fCompiledPat
->elementAti(topLoc
);
885 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
886 // Emit optimized code for a [char set]*
887 int32_t loopOpI
= URX_BUILD(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
888 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
889 dataLoc
= fRXPat
->fFrameSize
;
890 fRXPat
->fFrameSize
++;
891 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, dataLoc
);
892 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
896 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
897 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
898 // Emit Optimized code for .* operations.
899 int32_t loopOpI
= URX_BUILD(URX_LOOP_DOT_I
, 0);
900 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
901 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
904 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
905 dataLoc
= fRXPat
->fFrameSize
;
906 fRXPat
->fFrameSize
++;
907 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, dataLoc
);
908 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
913 // Emit general case code for this *
914 // The optimizations did not apply.
916 int32_t saveStateLoc
= blockTopLoc(TRUE
);
917 int32_t jmpOp
= URX_BUILD(URX_JMP_SAV
, saveStateLoc
+1);
919 // Check for minimum match length of zero, which requires
920 // extra loop-breaking code.
921 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
922 insertOp(saveStateLoc
);
923 dataLoc
= fRXPat
->fFrameSize
;
924 fRXPat
->fFrameSize
++;
926 int32_t op
= URX_BUILD(URX_STO_INP_LOC
, dataLoc
);
927 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
928 jmpOp
= URX_BUILD(URX_JMP_SAV_X
, saveStateLoc
+2);
931 // Locate the position in the compiled pattern where the match will continue
932 // after completing the *. (4 or 5 in the comment above)
933 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
935 // Put together the save state op store it into the compiled code.
936 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, continueLoc
);
937 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
939 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
940 fRXPat
->fCompiledPat
->addElement(jmpOp
, *fStatus
);
945 // Non-greedy *? quantifier
948 // 2. body of stuff being iterated over
952 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
953 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
954 int32_t jmpOp
= URX_BUILD(URX_JMP
, saveLoc
);
955 int32_t stateSaveOp
= URX_BUILD(URX_STATE_SAVE
, jmpLoc
+1);
956 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
957 fRXPat
->fCompiledPat
->addElement(stateSaveOp
, *fStatus
);
963 // The '{' opening an interval quantifier was just scanned.
964 // Init the counter varaiables that will accumulate the values as the digits
970 case doIntevalLowerDigit
:
971 // Scanned a digit from the lower value of an {lower,upper} interval
973 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
974 U_ASSERT(digitValue
>= 0);
975 fIntervalLow
= fIntervalLow
*10 + digitValue
;
976 if (fIntervalLow
< 0) {
977 error(U_REGEX_NUMBER_TOO_BIG
);
982 case doIntervalUpperDigit
:
983 // Scanned a digit from the upper value of an {lower,upper} interval
985 if (fIntervalUpper
< 0) {
988 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
989 U_ASSERT(digitValue
>= 0);
990 fIntervalUpper
= fIntervalUpper
*10 + digitValue
;
991 if (fIntervalLow
< 0) {
992 error(U_REGEX_NUMBER_TOO_BIG
);
998 // Scanned a single value interval like {27}. Upper = Lower.
999 fIntervalUpper
= fIntervalLow
;
1003 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1004 if (compileInlineInterval() == FALSE
) {
1005 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1009 case doPossessiveInterval
:
1010 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1012 // Remember the loc for the top of the block being looped over.
1013 // (Can not reserve a slot in the compiled pattern at this time, becuase
1014 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1016 int32_t topLoc
= blockTopLoc(FALSE
);
1018 // Produce normal looping code.
1019 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1021 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1022 // just as if the loop was inclosed in atomic parentheses.
1024 // First the STO_SP before the start of the loop
1026 int32_t varLoc
= fRXPat
->fDataSize
; // Reserve a data location for saving the
1027 fRXPat
->fDataSize
+= 1; // state stack ptr.
1028 int32_t op
= URX_BUILD(URX_STO_SP
, varLoc
);
1029 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1031 int32_t loopOp
= fRXPat
->fCompiledPat
->popi();
1032 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1033 loopOp
++; // point LoopOp after the just-inserted STO_SP
1034 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1036 // Then the LD_SP after the end of the loop
1037 op
= URX_BUILD(URX_LD_SP
, varLoc
);
1038 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1044 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1045 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1048 case doIntervalError
:
1049 error(U_REGEX_BAD_INTERVAL
);
1053 // We've just scanned a "normal" character from the pattern,
1054 literalChar(fC
.fChar
);
1060 // scanned a ".", match any single character.
1063 if (fModeFlags
& UREGEX_DOTALL
) {
1064 op
= URX_BUILD(URX_DOTANY_ALL
, 0);
1066 op
= URX_BUILD(URX_DOTANY
, 0);
1068 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1074 int32_t op
= (fModeFlags
& UREGEX_MULTILINE
)? URX_CARET_M
: URX_CARET
;
1075 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1082 int32_t op
= (fModeFlags
& UREGEX_MULTILINE
)? URX_DOLLAR_M
: URX_DOLLAR
;
1083 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1088 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_CARET
, 0), *fStatus
);
1092 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_B
, 1), *fStatus
);
1096 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_B
, 0), *fStatus
);
1100 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_D
, 1), *fStatus
);
1104 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_D
, 0), *fStatus
);
1108 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_G
, 0), *fStatus
);
1112 fRXPat
->fCompiledPat
->addElement(
1113 URX_BUILD(URX_STAT_SETREF_N
, URX_ISSPACE_SET
), *fStatus
);
1117 fRXPat
->fCompiledPat
->addElement(
1118 URX_BUILD(URX_STATIC_SETREF
, URX_ISSPACE_SET
), *fStatus
);
1122 fRXPat
->fCompiledPat
->addElement(
1123 URX_BUILD(URX_STAT_SETREF_N
, URX_ISWORD_SET
), *fStatus
);
1127 fRXPat
->fCompiledPat
->addElement(
1128 URX_BUILD(URX_STATIC_SETREF
, URX_ISWORD_SET
), *fStatus
);
1132 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_X
, 0), *fStatus
);
1137 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_DOLLAR
, 0), *fStatus
);
1141 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_Z
, 0), *fStatus
);
1145 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1154 UnicodeSet
*theSet
= scanProp();
1160 case doScanUnicodeSet
:
1162 UnicodeSet
*theSet
= scanSet();
1167 case doEnterQuoteMode
:
1168 // Just scanned a \Q. Put character scanner into quote mode.
1173 // BackReference. Somewhat unusual in that the front-end can not completely parse
1174 // the regular expression, because the number of digits to be consumed
1175 // depends on the number of capture groups that have been defined. So
1176 // we have to do it here instead.
1178 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1179 int32_t groupNum
= 0;
1180 UChar32 c
= fC
.fChar
;
1183 // Loop once per digit, for max allowed number of digits in a back reference.
1184 int32_t digit
= u_charDigitValue(c
);
1185 groupNum
= groupNum
* 10 + digit
;
1186 if (groupNum
>= numCaptureGroups
) {
1190 if (RegexStaticSets::gStaticSets
->fRuleDigits
->contains(c
) == FALSE
) {
1196 // Scan of the back reference in the source regexp is complete. Now generate
1197 // the compiled code for it.
1198 // Because capture groups can be forward-referenced by back-references,
1199 // we fill the operand with the capture group number. At the end
1200 // of compilation, it will be changed to the variables location.
1201 U_ASSERT(groupNum
> 0);
1203 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1204 op
= URX_BUILD(URX_BACKREF_I
, groupNum
);
1206 op
= URX_BUILD(URX_BACKREF
, groupNum
);
1208 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1214 error(U_REGEX_UNIMPLEMENTED
);
1219 case doPossessivePlus
:
1220 // Possessive ++ quantifier.
1223 // 2. body of stuff being iterated over
1229 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1230 // then unconditionally discarded. Perhaps introduce a new opcode
1234 int32_t topLoc
= blockTopLoc(TRUE
);
1235 int32_t stoLoc
= fRXPat
->fDataSize
;
1236 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1237 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1238 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1240 // Emit the STATE_SAVE
1241 op
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1242 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1245 op
= URX_BUILD(URX_JMP
, topLoc
+1);
1246 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1249 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1250 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1254 case doPossessiveStar
:
1255 // Possessive *+ quantifier.
1259 // 3. body of stuff being iterated over
1263 // TODO: do something to cut back the state stack each time through the loop.
1265 // Reserve two slots at the top of the block.
1266 int32_t topLoc
= blockTopLoc(TRUE
);
1270 int32_t stoLoc
= fRXPat
->fDataSize
;
1271 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1272 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1273 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1275 // Emit the SAVE_STATE 5
1276 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1277 op
= URX_BUILD(URX_STATE_SAVE
, L7
);
1278 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1280 // Append the JMP operation.
1281 op
= URX_BUILD(URX_JMP
, topLoc
+1);
1282 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1284 // Emit the LD_SP loc
1285 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1286 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1290 case doPossessiveOpt
:
1291 // Possessive ?+ quantifier.
1295 // 3. body of optional block
1300 // Reserve two slots at the top of the block.
1301 int32_t topLoc
= blockTopLoc(TRUE
);
1305 int32_t stoLoc
= fRXPat
->fDataSize
;
1306 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1307 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1308 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1310 // Emit the SAVE_STATE
1311 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1312 op
= URX_BUILD(URX_STATE_SAVE
, continueLoc
);
1313 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1316 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1317 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1322 case doBeginMatchMode
:
1323 fNewModeFlags
= fModeFlags
;
1324 fSetModeFlag
= TRUE
;
1327 case doMatchMode
: // (?i) and similar
1331 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1332 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1333 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1334 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1335 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1337 U_ASSERT(FALSE
); // Should never happen. Other chars are filtered out
1341 fNewModeFlags
|= bit
;
1343 fNewModeFlags
&= ~bit
;
1348 case doSetMatchMode
:
1349 // We've got a (?i) or similar. The match mode is being changed, but
1350 // the change is not scoped to a parenthesized block.
1351 fModeFlags
= fNewModeFlags
;
1353 // Prevent any string from spanning across the change of match mode.
1354 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
1359 case doMatchModeParen
:
1360 // We've got a (?i: or similar. Begin a parenthesized block, save old
1361 // mode flags so they can be restored at the close of the block.
1364 // - NOP, which later may be replaced by a save-state if the
1365 // parenthesized group gets a * quantifier, followed by
1366 // - NOP, which may later be replaced by a save-state if there
1367 // is an '|' alternation within the parens.
1369 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
1370 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
1372 // On the Parentheses stack, start a new frame and add the postions
1373 // of the two NOPs (a normal non-capturing () frame, except for the
1374 // saving of the orignal mode flags.)
1375 fParenStack
.push(fModeFlags
, *fStatus
);
1376 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1377 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1378 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1380 // Set the current mode flags to the new values.
1381 fModeFlags
= fNewModeFlags
;
1385 case doSuppressComments
:
1386 // We have just scanned a '(?'. We now need to prevent the character scanner from
1387 // treating a '#' as a to-the-end-of-line comment.
1388 // (This Perl compatibility just gets uglier and uglier to do...)
1389 fEOLComments
= FALSE
;
1396 error(U_REGEX_INTERNAL_ERROR
);
1400 if (U_FAILURE(*fStatus
)) {
1409 //------------------------------------------------------------------------------
1411 // literalChar We've encountered a literal character from the pattern,
1412 // or an escape sequence that reduces to a character.
1413 // Add it to the string containing all literal chars/strings from
1415 // If we are in a pattern string already, add the new char to it.
1416 // If we aren't in a pattern string, begin one now.
1418 //------------------------------------------------------------------------------
1419 void RegexCompile::literalChar(UChar32 c
) {
1420 int32_t op
; // An operation in the compiled pattern.
1422 int32_t patternLoc
; // A position in the compiled pattern.
1426 // If the last thing compiled into the pattern was not a literal char,
1427 // force this new literal char to begin a new string, and not append to the previous.
1428 op
= fRXPat
->fCompiledPat
->lastElementi();
1429 opType
= URX_TYPE(op
);
1430 if (!(opType
== URX_STRING_LEN
|| opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
)) {
1434 if (fStringOpStart
== -1) {
1435 // First char of a string in the pattern.
1436 // Emit a OneChar op into the compiled pattern.
1439 // Also add it to the string pool, in case we get a second adjacent literal
1440 // and want to change form ONE_CHAR to STRING
1441 fStringOpStart
= fRXPat
->fLiteralText
.length();
1442 fRXPat
->fLiteralText
.append(c
);
1446 // We are adding onto an existing string
1447 fRXPat
->fLiteralText
.append(c
);
1449 op
= fRXPat
->fCompiledPat
->lastElementi();
1450 opType
= URX_TYPE(op
);
1451 U_ASSERT(opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
|| opType
== URX_STRING_LEN
);
1453 // If the most recently emitted op is a URX_ONECHAR,
1454 if (opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
) {
1455 if (U16_IS_TRAIL(c
) && U16_IS_LEAD(URX_VAL(op
))) {
1456 // The most recently emitted op is a ONECHAR that was the first half
1457 // of a surrogate pair. Update the ONECHAR's operand to be the
1458 // supplementary code point resulting from both halves of the pair.
1459 c
= U16_GET_SUPPLEMENTARY(URX_VAL(op
), c
);
1460 op
= URX_BUILD(opType
, c
);
1461 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1462 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1466 // The most recently emitted op is a ONECHAR.
1467 // We've now received another adjacent char. Change the ONECHAR op
1469 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1470 op
= URX_BUILD(URX_STRING_I
, fStringOpStart
);
1472 op
= URX_BUILD(URX_STRING
, fStringOpStart
);
1474 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1475 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1476 op
= URX_BUILD(URX_STRING_LEN
, 0);
1477 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1480 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the
1481 // string length to reflect the new char we just added to the string.
1482 stringLen
= fRXPat
->fLiteralText
.length() - fStringOpStart
;
1483 op
= URX_BUILD(URX_STRING_LEN
, stringLen
);
1484 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1485 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1490 //------------------------------------------------------------------------------
1492 // emitONE_CHAR emit a ONE_CHAR op into the generated code.
1493 // Choose cased or uncased version, depending on the
1494 // match mode and whether the character itself is cased.
1496 //------------------------------------------------------------------------------
1497 void RegexCompile::emitONE_CHAR(UChar32 c
) {
1499 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1500 u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
1501 // We have a cased character, and are in case insensitive matching mode.
1502 c
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
1503 op
= URX_BUILD(URX_ONECHAR_I
, c
);
1505 // Uncased char, or case sensitive match mode.
1506 // Either way, just generate a literal compare of the char.
1507 op
= URX_BUILD(URX_ONECHAR
, c
);
1509 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1513 //------------------------------------------------------------------------------
1515 // fixLiterals When compiling something that can follow a literal
1516 // string in a pattern, we need to "fix" any preceding
1517 // string, which will cause any subsequent literals to
1518 // begin a new string, rather than appending to the
1521 // Optionally, split the last char of the string off into
1522 // a single "ONE_CHAR" operation, so that quantifiers can
1523 // apply to that char alone. Example: abc*
1524 // The * must apply to the 'c' only.
1526 //------------------------------------------------------------------------------
1527 void RegexCompile::fixLiterals(UBool split
) {
1528 int32_t stringStart
= fStringOpStart
; // start index of the current literal string
1529 int32_t op
; // An op from/for the compiled pattern.
1530 int32_t opType
; // An opcode type from the compiled pattern.
1531 int32_t stringLastCharIdx
;
1533 int32_t stringNextToLastCharIdx
;
1534 UChar32 nextToLastChar
;
1537 fStringOpStart
= -1;
1542 // Split: We need to ensure that the last item in the compiled pattern does
1543 // not refer to a literal string of more than one char. If it does,
1544 // separate the last char from the rest of the string.
1546 // If the last operation from the compiled pattern is not a string,
1547 // nothing needs to be done
1548 op
= fRXPat
->fCompiledPat
->lastElementi();
1549 opType
= URX_TYPE(op
);
1550 if (opType
!= URX_STRING_LEN
) {
1553 stringLen
= URX_VAL(op
);
1556 // Find the position of the last code point in the string (might be a surrogate pair)
1558 stringLastCharIdx
= fRXPat
->fLiteralText
.length();
1559 stringLastCharIdx
= fRXPat
->fLiteralText
.moveIndex32(stringLastCharIdx
, -1);
1560 lastChar
= fRXPat
->fLiteralText
.char32At(stringLastCharIdx
);
1562 // The string should always be at least two code points long, meaning that there
1563 // should be something before the last char position that we just found.
1564 U_ASSERT(stringLastCharIdx
> stringStart
);
1565 stringNextToLastCharIdx
= fRXPat
->fLiteralText
.moveIndex32(stringLastCharIdx
, -1);
1566 U_ASSERT(stringNextToLastCharIdx
>= stringStart
);
1567 nextToLastChar
= fRXPat
->fLiteralText
.char32At(stringNextToLastCharIdx
);
1569 if (stringNextToLastCharIdx
> stringStart
) {
1570 // The length of string remaining after removing one char is two or more.
1571 // Leave the string in the compiled pattern, shorten it by one char,
1572 // and append a URX_ONECHAR op for the last char.
1573 stringLen
-= (fRXPat
->fLiteralText
.length() - stringLastCharIdx
);
1574 op
= URX_BUILD(URX_STRING_LEN
, stringLen
);
1575 fRXPat
->fCompiledPat
->setElementAt(op
, fRXPat
->fCompiledPat
->size() -1);
1576 emitONE_CHAR(lastChar
);
1578 // The original string consisted of exactly two characters. Replace
1579 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
1581 fRXPat
->fCompiledPat
->setSize(fRXPat
->fCompiledPat
->size() -2);
1582 emitONE_CHAR(nextToLastChar
);
1583 emitONE_CHAR(lastChar
);
1592 //------------------------------------------------------------------------------
1594 // insertOp() Insert a slot for a new opcode into the already
1595 // compiled pattern code.
1597 // Fill the slot with a NOP. Our caller will replace it
1598 // with what they really wanted.
1600 //------------------------------------------------------------------------------
1601 void RegexCompile::insertOp(int32_t where
) {
1602 UVector32
*code
= fRXPat
->fCompiledPat
;
1603 U_ASSERT(where
>0 && where
< code
->size());
1605 int32_t nop
= URX_BUILD(URX_NOP
, 0);
1606 code
->insertElementAt(nop
, where
, *fStatus
);
1608 // Walk through the pattern, looking for any ops with targets that
1609 // were moved down by the insert. Fix them.
1611 for (loc
=0; loc
<code
->size(); loc
++) {
1612 int32_t op
= code
->elementAti(loc
);
1613 int32_t opType
= URX_TYPE(op
);
1614 int32_t opValue
= URX_VAL(op
);
1615 if ((opType
== URX_JMP
||
1616 opType
== URX_JMPX
||
1617 opType
== URX_STATE_SAVE
||
1618 opType
== URX_CTR_LOOP
||
1619 opType
== URX_CTR_LOOP_NG
||
1620 opType
== URX_JMP_SAV
||
1621 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
1622 // Target location for this opcode is after the insertion point and
1623 // needs to be incremented to adjust for the insertion.
1625 op
= URX_BUILD(opType
, opValue
);
1626 code
->setElementAt(op
, loc
);
1630 // Now fix up the parentheses stack. All positive values in it are locations in
1631 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
1632 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
1633 int32_t x
= fParenStack
.elementAti(loc
);
1636 fParenStack
.setElementAt(x
, loc
);
1640 if (fMatchCloseParen
> where
) {
1643 if (fMatchOpenParen
> where
) {
1650 //------------------------------------------------------------------------------
1652 // blockTopLoc() Find or create a location in the compiled pattern
1653 // at the start of the operation or block that has
1654 // just been compiled. Needed when a quantifier (* or
1655 // whatever) appears, and we need to add an operation
1656 // at the start of the thing being quantified.
1658 // (Parenthesized Blocks) have a slot with a NOP that
1659 // is reserved for this purpose. .* or similar don't
1660 // and a slot needs to be added.
1662 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
1663 // at the returned location.
1664 // FALSE - just return the address,
1665 // do not reserve a location there.
1667 //------------------------------------------------------------------------------
1668 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
1670 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
1672 // The item just processed is a parenthesized block.
1673 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
1674 U_ASSERT(theLoc
> 0);
1675 uint32_t opAtTheLoc
= fRXPat
->fCompiledPat
->elementAti(theLoc
);
1676 U_ASSERT(URX_TYPE(opAtTheLoc
) == URX_NOP
);
1679 // Item just compiled is a single thing, a ".", or a single char, or a set reference.
1680 // No slot for STATE_SAVE was pre-reserved in the compiled code.
1681 // We need to make space now.
1682 fixLiterals(TRUE
); // If last item was a string, separate the last char.
1683 theLoc
= fRXPat
->fCompiledPat
->size()-1;
1685 int32_t opAtTheLoc
= fRXPat
->fCompiledPat
->elementAti(theLoc
);
1686 int32_t nop
= URX_BUILD(URX_NOP
, 0);
1687 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
1695 //------------------------------------------------------------------------------
1697 // handleCloseParen When compiling a close paren, we need to go back
1698 // and fix up any JMP or SAVE operations within the
1699 // parenthesized block that need to target the end
1700 // of the block. The locations of these are kept on
1701 // the paretheses stack.
1703 // This function is called both when encountering a
1704 // real ) and at the end of the pattern.
1706 //-------------------------------------------------------------------------------
1707 void RegexCompile::handleCloseParen() {
1710 if (fParenStack
.size() <= 0) {
1711 error(U_REGEX_MISMATCHED_PAREN
);
1715 // Force any literal chars that may follow the close paren to start a new string,
1716 // and not attach to any preceding it.
1719 // Fixup any operations within the just-closed parenthesized group
1720 // that need to reference the end of the (block).
1721 // (The first one popped from the stack is an unused slot for
1722 // alternation (OR) state save, but applying the fixup to it does no harm.)
1724 patIdx
= fParenStack
.popi();
1726 // value < 0 flags the start of the frame on the paren stack.
1729 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
1730 patOp
= fRXPat
->fCompiledPat
->elementAti(patIdx
);
1731 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
1732 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
1733 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
1734 fMatchOpenParen
= patIdx
;
1737 // At the close of any parenthesized block, restore the match mode flags to
1738 // the value they had at the open paren. Saved value is
1739 // at the top of the paren stack.
1740 fModeFlags
= fParenStack
.popi();
1742 // DO any additional fixups, depending on the specific kind of
1743 // parentesized grouping this is
1748 // No additional fixups required.
1749 // (Grouping-only parentheses)
1752 // Capturing Parentheses.
1753 // Insert a End Capture op into the pattern.
1754 // The frame offset of the variables for this cg is obtained from the
1755 // start capture op and put it into the end-capture op.
1757 int32_t captureOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
1758 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
1760 int32_t frameVarLocation
= URX_VAL(captureOp
);
1761 int32_t endCaptureOp
= URX_BUILD(URX_END_CAPTURE
, frameVarLocation
);
1762 fRXPat
->fCompiledPat
->addElement(endCaptureOp
, *fStatus
);
1766 // Atomic Parenthesis.
1767 // Insert a LD_SP operation to restore the state stack to the position
1768 // it was when the atomic parens were entered.
1770 int32_t stoOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
1771 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
1772 int32_t stoLoc
= URX_VAL(stoOp
);
1773 int32_t ldOp
= URX_BUILD(URX_LD_SP
, stoLoc
);
1774 fRXPat
->fCompiledPat
->addElement(ldOp
, *fStatus
);
1780 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
1781 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
1782 int32_t dataLoc
= URX_VAL(startOp
);
1783 int32_t op
= URX_BUILD(URX_LA_END
, dataLoc
);
1784 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1790 // See comment at doOpenLookAheadNeg
1791 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
1792 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
1793 int32_t dataLoc
= URX_VAL(startOp
);
1794 int32_t op
= URX_BUILD(URX_LA_END
, dataLoc
);
1795 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1796 op
= URX_BUILD(URX_FAIL
, 0);
1797 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1799 // Patch the URX_SAVE near the top of the block.
1800 int32_t saveOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
1801 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
1802 int32_t dest
= fRXPat
->fCompiledPat
->size();
1803 saveOp
= URX_BUILD(URX_STATE_SAVE
, dest
);
1804 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
1810 // See comment at doOpenLookBehind.
1812 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
1813 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
1814 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
1815 int32_t dataLoc
= URX_VAL(startOp
);
1816 int32_t op
= URX_BUILD(URX_LB_END
, dataLoc
);
1817 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1818 op
= URX_BUILD(URX_LA_END
, dataLoc
);
1819 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1821 // Determine the min and max bounds for the length of the
1822 // string that the pattern can match.
1823 // An unbounded upper limit is an error.
1824 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
1825 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
1826 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
1827 if (maxML
== INT32_MAX
) {
1828 error(U_REGEX_LOOK_BEHIND_LIMIT
);
1831 U_ASSERT(minML
<= maxML
);
1833 // Insert the min and max match len bounds into the URX_LB_CONT op that
1834 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1835 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
1836 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
1845 // See comment at doOpenLookBehindNeg.
1847 // Append the URX_LBN_END to the compiled pattern.
1848 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
1849 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
1850 int32_t dataLoc
= URX_VAL(startOp
);
1851 int32_t op
= URX_BUILD(URX_LBN_END
, dataLoc
);
1852 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1854 // Determine the min and max bounds for the length of the
1855 // string that the pattern can match.
1856 // An unbounded upper limit is an error.
1857 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
1858 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
1859 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
1860 if (maxML
== INT32_MAX
) {
1861 error(U_REGEX_LOOK_BEHIND_LIMIT
);
1864 U_ASSERT(minML
<= maxML
);
1866 // Insert the min and max match len bounds into the URX_LB_CONT op that
1867 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1868 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
1869 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
1871 // Insert the pattern location to continue at after a successful match
1872 // as the last operand of the URX_LBN_CONT
1873 op
= URX_BUILD(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
1874 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
1884 // remember the next location in the compiled pattern.
1885 // The compilation of Quantifiers will look at this to see whether its looping
1886 // over a parenthesized block or a single item
1887 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
1892 //----------------------------------------------------------------------------------------
1894 // compileSet Compile the pattern operations for a reference to a
1897 //----------------------------------------------------------------------------------------
1898 void RegexCompile::compileSet(UnicodeSet
*theSet
)
1900 if (theSet
== NULL
) {
1903 int32_t setSize
= theSet
->size();
1904 UChar32 firstSetChar
= theSet
->charAt(0);
1905 if (firstSetChar
== -1) {
1906 // Sets that contain only strings, but no individual chars,
1907 // will end up here.
1908 error(U_REGEX_SET_CONTAINS_STRING
);
1915 // Set of no elements. Always fails to match.
1916 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKTRACK
, 0), *fStatus
);
1923 // The set contains only a single code point. Put it into
1924 // the compiled pattern as a single char operation rather
1925 // than a set, and discard the set itself.
1926 literalChar(firstSetChar
);
1933 // The set contains two or more chars. (the normal case)
1934 // Put it into the compiled pattern as a set.
1935 int32_t setNumber
= fRXPat
->fSets
->size();
1936 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
1937 int32_t setOp
= URX_BUILD(URX_SETREF
, setNumber
);
1938 fRXPat
->fCompiledPat
->addElement(setOp
, *fStatus
);
1944 //----------------------------------------------------------------------------------------
1946 // compileInterval Generate the code for a {min, max} style interval quantifier.
1947 // Except for the specific opcodes used, the code is the same
1948 // for all three types (greedy, non-greedy, possessive) of
1949 // intervals. The opcodes are supplied as parameters.
1951 // The code for interval loops has this form:
1952 // 0 CTR_INIT counter loc (in stack frame)
1953 // 1 5 patt address of CTR_LOOP at bottom of block
1955 // 3 max count (-1 for unbounded)
1956 // 4 ... block to be iterated over
1960 //----------------------------------------------------------------------------------------
1961 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
1963 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
1964 // four slots in the compiled code. Reserve them.
1965 int32_t topOfBlock
= blockTopLoc(TRUE
);
1966 insertOp(topOfBlock
);
1967 insertOp(topOfBlock
);
1968 insertOp(topOfBlock
);
1970 // The operands for the CTR_INIT opcode include the index in the matcher data
1971 // of the counter. Allocate it now.
1972 int32_t counterLoc
= fRXPat
->fFrameSize
;
1973 fRXPat
->fFrameSize
++;
1975 int32_t op
= URX_BUILD(InitOp
, counterLoc
);
1976 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
1978 // The second operand of CTR_INIT is the location following the end of the loop.
1979 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
1980 // compilation of something later on causes the code to grow and the target
1981 // position to move.
1982 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
1983 op
= URX_BUILD(URX_RELOC_OPRND
, loopEnd
);
1984 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
1986 // Followed by the min and max counts.
1987 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
1988 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
1990 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
1991 // Goes at end of the block being looped over, so just append to the code so far.
1992 op
= URX_BUILD(LoopOp
, topOfBlock
);
1993 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1995 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
1996 error(U_REGEX_MAX_LT_MIN
);
2005 UBool
RegexCompile::compileInlineInterval() {
2006 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
2007 // Too big to inline. Fail, which will cause looping code to be generated.
2008 // (Upper < Lower picks up unbounded upper and errors, both.)
2012 int32_t topOfBlock
= blockTopLoc(FALSE
);
2013 if (fIntervalUpper
== 0) {
2014 // Pathological case. Attempt no matches, as if the block doesn't exist.
2015 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2019 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2020 // The thing being repeated is not a single op, but some
2021 // more complex block. Do it as a loop, not inlines.
2022 // Note that things "repeated" a max of once are handled as inline, because
2023 // the one copy of the code already generated is just fine.
2027 // Pick up the opcode that is to be repeated
2029 int32_t op
= fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2031 // Compute the pattern location where the inline sequence
2032 // will end, and set up the state save op that will be needed.
2034 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2035 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2036 int32_t saveOp
= URX_BUILD(URX_STATE_SAVE
, endOfSequenceLoc
);
2037 if (fIntervalLow
== 0) {
2038 insertOp(topOfBlock
);
2039 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2044 // Loop, emitting the op for the thing being repeated each time.
2045 // Loop starts at 1 because one instance of the op already exists in the pattern,
2046 // it was put there when it was originally encountered.
2048 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2049 if (i
== fIntervalLow
) {
2050 fRXPat
->fCompiledPat
->addElement(saveOp
, *fStatus
);
2052 if (i
> fIntervalLow
) {
2053 fRXPat
->fCompiledPat
->addElement(saveOp
, *fStatus
);
2055 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
2062 //----------------------------------------------------------------------------------------
2064 // matchStartType Determine how a match can start.
2065 // Used to optimize find() operations.
2067 // Operation is very similar to minMatchLength(). Walk the compiled
2068 // pattern, keeping an on-going minimum-match-length. For any
2069 // op where the min match coming in is zero, add that ops possible
2070 // starting matches to the possible starts for the overall pattern.
2072 //----------------------------------------------------------------------------------------
2073 void RegexCompile::matchStartType() {
2074 if (U_FAILURE(*fStatus
)) {
2079 int32_t loc
; // Location in the pattern of the current op being processed.
2080 int32_t op
; // The op being processed
2081 int32_t opType
; // The opcode type of the op
2082 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2083 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2085 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2086 // could have advanced the position in a match.
2087 // (Maximum match length so far == 0)
2089 // forwardedLength is a vector holding minimum-match-length values that
2090 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2091 // It must be one longer than the pattern being checked because some ops
2092 // will jmp to a end-of-block+1 location from within a block, and we must
2093 // count those when checking the block.
2094 int32_t end
= fRXPat
->fCompiledPat
->size();
2095 UVector32
forwardedLength(end
+1, *fStatus
);
2096 forwardedLength
.setSize(end
+1);
2097 for (loc
=3; loc
<end
; loc
++) {
2098 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2101 for (loc
= 3; loc
<end
; loc
++) {
2102 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2103 opType
= URX_TYPE(op
);
2105 // The loop is advancing linearly through the pattern.
2106 // If the op we are now at was the destination of a branch in the pattern,
2107 // and that path has a shorter minimum length than the current accumulated value,
2108 // replace the current accumulated value.
2109 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2110 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2111 currentLen
= forwardedLength
.elementAti(loc
);
2112 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2116 // Ops that don't change the total length matched
2117 case URX_RESERVED_OP
:
2119 case URX_STRING_LEN
:
2121 case URX_START_CAPTURE
:
2122 case URX_END_CAPTURE
:
2123 case URX_BACKSLASH_B
:
2124 case URX_BACKSLASH_G
:
2125 case URX_BACKSLASH_Z
:
2127 case URX_RELOC_OPRND
:
2128 case URX_STO_INP_LOC
:
2131 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2134 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2140 fRXPat
->fStartType
= START_START
;
2146 fRXPat
->fStartType
= START_LINE
;
2151 if (currentLen
== 0) {
2152 // This character could appear at the start of a match.
2153 // Add it to the set of possible starting characters.
2154 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2155 numInitialStrings
+= 2;
2163 if (currentLen
== 0) {
2164 int32_t sn
= URX_VAL(op
);
2165 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2166 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2167 fRXPat
->fInitialChars
->addAll(*s
);
2168 numInitialStrings
+= 2;
2175 // [Set]*, like a SETREF, above, in what it can match,
2176 // but may not match at all, so currentLen is not incremented.
2177 if (currentLen
== 0) {
2178 int32_t sn
= URX_VAL(op
);
2179 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2180 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2181 fRXPat
->fInitialChars
->addAll(*s
);
2182 numInitialStrings
+= 2;
2187 case URX_LOOP_DOT_I
:
2188 if (currentLen
== 0) {
2189 // .* at the start of a pattern.
2190 // Any character can begin the match.
2191 fRXPat
->fInitialChars
->clear();
2192 fRXPat
->fInitialChars
->complement();
2193 numInitialStrings
+= 2;
2199 case URX_STATIC_SETREF
:
2200 if (currentLen
== 0) {
2201 int32_t sn
= URX_VAL(op
);
2202 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2203 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2204 fRXPat
->fInitialChars
->addAll(*s
);
2205 numInitialStrings
+= 2;
2213 case URX_STAT_SETREF_N
:
2214 if (currentLen
== 0) {
2215 int32_t sn
= URX_VAL(op
);
2216 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2219 fRXPat
->fInitialChars
->addAll(sc
);
2220 numInitialStrings
+= 2;
2228 case URX_BACKSLASH_D
:
2230 if (currentLen
== 0) {
2232 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2233 if (URX_VAL(op
) != 0) {
2236 fRXPat
->fInitialChars
->addAll(s
);
2237 numInitialStrings
+= 2;
2245 // Case Insensitive Single Character.
2246 if (currentLen
== 0) {
2247 UChar32 c
= URX_VAL(op
);
2248 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2249 // character may have distinct cased forms. Add all of them
2250 // to the set of possible starting match chars.
2252 s
.closeOver(USET_CASE
);
2253 fRXPat
->fInitialChars
->addAll(s
);
2255 // Char has no case variants. Just add it as-is to the
2256 // set of possible starting chars.
2257 fRXPat
->fInitialChars
->add(c
);
2259 numInitialStrings
+= 2;
2266 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2267 case URX_DOTANY_ALL
: // . matches one or two.
2269 case URX_DOTANY_ALL_PL
:
2271 if (currentLen
== 0) {
2272 // These constructs are all bad news when they appear at the start
2273 // of a match. Any character can begin the match.
2274 fRXPat
->fInitialChars
->clear();
2275 fRXPat
->fInitialChars
->complement();
2276 numInitialStrings
+= 2;
2284 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2287 int32_t jmpDest
= URX_VAL(op
);
2288 if (jmpDest
< loc
) {
2289 // Loop of some kind. Can safely ignore, the worst that will happen
2290 // is that we understate the true minimum length
2291 currentLen
= forwardedLength
.elementAti(loc
+1);
2294 // Forward jump. Propagate the current min length to the target loc of the jump.
2295 U_ASSERT(jmpDest
<= end
+1);
2296 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2297 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2306 // Combo of state save to the next loc, + jmp backwards.
2307 // Net effect on min. length computation is nothing.
2312 // Fails are kind of like a branch, except that the min length was
2313 // propagated already, by the state save.
2314 currentLen
= forwardedLength
.elementAti(loc
+1);
2319 case URX_STATE_SAVE
:
2321 // State Save, for forward jumps, propagate the current minimum.
2322 // of the state save.
2323 int32_t jmpDest
= URX_VAL(op
);
2324 if (jmpDest
> loc
) {
2325 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2326 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2339 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2340 int32_t stringLen
= URX_VAL(stringLenOp
);
2341 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2342 U_ASSERT(stringLenOp
>= 2);
2343 if (currentLen
== 0) {
2344 // Add the starting character of this string to the set of possible starting
2345 // characters for this pattern.
2346 int32_t stringStartIdx
= URX_VAL(op
);
2347 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2348 fRXPat
->fInitialChars
->add(c
);
2350 // Remember this string. After the entire pattern has been checked,
2351 // if nothing else is identified that can start a match, we'll use it.
2352 numInitialStrings
++;
2353 fRXPat
->fInitialStringIdx
= stringStartIdx
;
2354 fRXPat
->fInitialStringLen
= stringLen
;
2357 currentLen
+= stringLen
;
2364 // Case-insensitive string. Unlike exact-match strings, we won't
2365 // attempt a string search for possible match positions. But we
2366 // do update the set of possible starting characters.
2368 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2369 int32_t stringLen
= URX_VAL(stringLenOp
);
2370 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2371 U_ASSERT(stringLenOp
>= 2);
2372 if (currentLen
== 0) {
2373 // Add the starting character of this string to the set of possible starting
2374 // characters for this pattern.
2375 int32_t stringStartIdx
= URX_VAL(op
);
2376 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2378 s
.closeOver(USET_CASE
);
2379 fRXPat
->fInitialChars
->addAll(s
);
2380 numInitialStrings
+= 2; // Matching on an initial string not possible.
2382 currentLen
+= stringLen
;
2388 case URX_CTR_INIT_NG
:
2390 // Loop Init Ops. These don't change the min length, but they are 4 word ops
2391 // so location must be updated accordingly.
2393 // If the min loop count == 0
2394 // move loc forwards to the end of the loop, skipping over the body.
2395 // If the min count is > 0,
2396 // continue normal processing of the body of the loop.
2397 int32_t loopEndLoc
= fRXPat
->fCompiledPat
->elementAti(loc
+1);
2398 loopEndLoc
= URX_VAL(loopEndLoc
);
2399 int32_t minLoopCount
= fRXPat
->fCompiledPat
->elementAti(loc
+2);
2400 if (minLoopCount
== 0) {
2403 loc
+=3; // Skips over operands of CTR_INIT
2411 case URX_CTR_LOOP_NG
:
2413 // The jump is conditional, backwards only.
2418 // More loop ops. These state-save to themselves.
2419 // don't change the minimum match
2427 // Look-around. Scan forward until the matching look-ahead end,
2428 // without processing the look-around block. This is overly pessimistic.
2432 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2433 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2436 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2442 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
2443 // Need this because neg lookahead blocks will FAIL to outside
2445 int32_t jmpDest
= URX_VAL(op
);
2446 if (jmpDest
> loc
) {
2447 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2448 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2452 U_ASSERT(loc
<= end
);
2462 U_ASSERT(FALSE
); // Shouldn't get here. These ops should be
2463 // consumed by the scan in URX_LA_START and LB_START
2474 // We have finished walking through the ops. Check whether some forward jump
2475 // propagated a shorter length to location end+1.
2476 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
2477 currentLen
= forwardedLength
.elementAti(end
+1);
2481 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
2484 // Sort out what we should check for when looking for candidate match start positions.
2485 // In order of preference,
2486 // 1. Start of input text buffer.
2487 // 2. A literal string.
2488 // 3. Start of line in multi-line mode.
2489 // 4. A single literal character.
2490 // 5. A character from a set of characters.
2492 if (fRXPat
->fStartType
== START_START
) {
2493 // Match only at the start of an input text string.
2494 // start type is already set. We're done.
2495 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
2496 // Match beginning only with a literal string.
2497 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
2498 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
2499 fRXPat
->fStartType
= START_STRING
;
2500 fRXPat
->fInitialChar
= c
;
2501 } else if (fRXPat
->fStartType
== START_LINE
) {
2502 // Match at start of line in Mulit-Line mode.
2503 // Nothing to do here; everything is already set.
2504 } else if (fRXPat
->fMinMatchLen
== 0) {
2505 // Zero length match possible. We could start anywhere.
2506 fRXPat
->fStartType
= START_NO_INFO
;
2507 } else if (fRXPat
->fInitialChars
->size() == 1) {
2508 // All matches begin with the same char.
2509 fRXPat
->fStartType
= START_CHAR
;
2510 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
2511 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
2512 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
2513 fRXPat
->fMinMatchLen
> 0) {
2514 // Matches start with a set of character smaller than the set of all chars.
2515 fRXPat
->fStartType
= START_SET
;
2517 // Matches can start with anything
2518 fRXPat
->fStartType
= START_NO_INFO
;
2526 //----------------------------------------------------------------------------------------
2528 // minMatchLength Calculate the length of the shortest string that could
2529 // match the specified pattern.
2530 // Length is in 16 bit code units, not code points.
2532 // The calculated length may not be exact. The returned
2533 // value may be shorter than the actual minimum; it must
2536 // start and end are the range of p-code operations to be
2537 // examined. The endpoints are included in the range.
2539 //----------------------------------------------------------------------------------------
2540 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
2541 if (U_FAILURE(*fStatus
)) {
2545 U_ASSERT(start
<= end
);
2546 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
2552 int32_t currentLen
= 0;
2555 // forwardedLength is a vector holding minimum-match-length values that
2556 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2557 // It must be one longer than the pattern being checked because some ops
2558 // will jmp to a end-of-block+1 location from within a block, and we must
2559 // count those when checking the block.
2560 UVector32
forwardedLength(end
+2, *fStatus
);
2561 forwardedLength
.setSize(end
+2);
2562 for (loc
=start
; loc
<=end
+1; loc
++) {
2563 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2566 for (loc
= start
; loc
<=end
; loc
++) {
2567 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2568 opType
= URX_TYPE(op
);
2570 // The loop is advancing linearly through the pattern.
2571 // If the op we are now at was the destination of a branch in the pattern,
2572 // and that path has a shorter minimum length than the current accumulated value,
2573 // replace the current accumulated value.
2574 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2575 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2576 currentLen
= forwardedLength
.elementAti(loc
);
2577 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2581 // Ops that don't change the total length matched
2582 case URX_RESERVED_OP
:
2584 case URX_STRING_LEN
:
2586 case URX_START_CAPTURE
:
2587 case URX_END_CAPTURE
:
2588 case URX_BACKSLASH_B
:
2589 case URX_BACKSLASH_G
:
2590 case URX_BACKSLASH_Z
:
2593 case URX_RELOC_OPRND
:
2594 case URX_STO_INP_LOC
:
2598 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2601 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2609 // Ops that match a minimum of one character (one or two 16 bit code units.)
2612 case URX_STATIC_SETREF
:
2613 case URX_STAT_SETREF_N
:
2615 case URX_BACKSLASH_D
:
2617 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2618 case URX_DOTANY_ALL
: // . matches one or two.
2621 case URX_DOTANY_ALL_PL
:
2627 loc
++; // URX_JMPX has an extra operand, ignored here,
2628 // otherwise processed identically to URX_JMP.
2631 int32_t jmpDest
= URX_VAL(op
);
2632 if (jmpDest
< loc
) {
2633 // Loop of some kind. Can safely ignore, the worst that will happen
2634 // is that we understate the true minimum length
2635 currentLen
= forwardedLength
.elementAti(loc
+1);
2637 // Forward jump. Propagate the current min length to the target loc of the jump.
2638 U_ASSERT(jmpDest
<= end
+1);
2639 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2640 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2648 // Fails are kind of like a branch, except that the min length was
2649 // propagated already, by the state save.
2650 currentLen
= forwardedLength
.elementAti(loc
+1);
2651 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2656 case URX_STATE_SAVE
:
2658 // State Save, for forward jumps, propagate the current minimum.
2659 // of the state save.
2660 int32_t jmpDest
= URX_VAL(op
);
2661 if (jmpDest
> loc
) {
2662 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2663 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2674 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2675 currentLen
+= URX_VAL(stringLenOp
);
2681 case URX_CTR_INIT_NG
:
2684 // If the min loop count == 0
2685 // move loc forwards to the end of the loop, skipping over the body.
2686 // If the min count is > 0,
2687 // continue normal processing of the body of the loop.
2688 int32_t loopEndLoc
= fRXPat
->fCompiledPat
->elementAti(loc
+1);
2689 loopEndLoc
= URX_VAL(loopEndLoc
);
2690 int32_t minLoopCount
= fRXPat
->fCompiledPat
->elementAti(loc
+2);
2691 if (minLoopCount
== 0) {
2694 loc
+=3; // Skips over operands of CTR_INIT
2701 case URX_CTR_LOOP_NG
:
2703 // The jump is conditional, backwards only.
2707 case URX_LOOP_DOT_I
:
2709 // More loop ops. These state-save to themselves.
2710 // don't change the minimum match - could match nothing at all.
2717 // Look-around. Scan forward until the matching look-ahead end,
2718 // without processing the look-around block. This is overly pessimistic.
2719 // TODO: Positive lookahead could recursively do the block, then continue
2720 // with the longer of the block or the value coming in.
2724 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2725 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2728 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2734 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
2735 // Need this because neg lookahead blocks will FAIL to outside
2737 int32_t jmpDest
= URX_VAL(op
);
2738 if (jmpDest
> loc
) {
2739 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2740 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2745 U_ASSERT(loc
<= end
);
2755 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
2756 // range being sized, which happens when measuring size of look-behind blocks.
2765 // We have finished walking through the ops. Check whether some forward jump
2766 // propagated a shorter length to location end+1.
2767 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
2768 currentLen
= forwardedLength
.elementAti(end
+1);
2769 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2777 //----------------------------------------------------------------------------------------
2779 // maxMatchLength Calculate the length of the longest string that could
2780 // match the specified pattern.
2781 // Length is in 16 bit code units, not code points.
2783 // The calculated length may not be exact. The returned
2784 // value may be longer than the actual maximum; it must
2785 // never be shorter.
2787 //----------------------------------------------------------------------------------------
2788 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
2789 if (U_FAILURE(*fStatus
)) {
2792 U_ASSERT(start
<= end
);
2793 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
2799 int32_t currentLen
= 0;
2800 UVector32
forwardedLength(end
+1, *fStatus
);
2801 forwardedLength
.setSize(end
+1);
2803 for (loc
=start
; loc
<=end
; loc
++) {
2804 forwardedLength
.setElementAt(0, loc
);
2807 for (loc
= start
; loc
<=end
; loc
++) {
2808 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2809 opType
= URX_TYPE(op
);
2811 // The loop is advancing linearly through the pattern.
2812 // If the op we are now at was the destination of a branch in the pattern,
2813 // and that path has a longer maximum length than the current accumulated value,
2814 // replace the current accumulated value.
2815 if (forwardedLength
.elementAti(loc
) > currentLen
) {
2816 currentLen
= forwardedLength
.elementAti(loc
);
2820 // Ops that don't change the total length matched
2821 case URX_RESERVED_OP
:
2823 case URX_STRING_LEN
:
2825 case URX_START_CAPTURE
:
2826 case URX_END_CAPTURE
:
2827 case URX_BACKSLASH_B
:
2828 case URX_BACKSLASH_G
:
2829 case URX_BACKSLASH_Z
:
2832 case URX_RELOC_OPRND
:
2833 case URX_STO_INP_LOC
:
2838 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2848 // Ops that increase that cause an unbounded increase in the length
2849 // of a matched string, or that increase it a hard to characterize way.
2850 // Call the max length unbounded, and stop further checking.
2851 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2853 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2855 case URX_DOTANY_ALL_PL
:
2856 currentLen
= INT32_MAX
;
2860 // Ops that match a max of one character (possibly two 16 bit code units.)
2862 case URX_STATIC_SETREF
:
2863 case URX_STAT_SETREF_N
:
2865 case URX_BACKSLASH_D
:
2867 case URX_DOTANY_ALL
:
2872 // Single literal character. Increase current max length by one or two,
2873 // depending on whether the char is in the supplementary range.
2876 if (URX_VAL(op
) > 0x10000) {
2888 int32_t jmpDest
= URX_VAL(op
);
2889 if (jmpDest
< loc
) {
2890 // Loop of some kind. Max match length is unbounded.
2891 currentLen
= INT32_MAX
;
2893 // Forward jump. Propagate the current min length to the target loc of the jump.
2894 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
2895 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2903 // Fails are kind of like a branch, except that the max length was
2904 // propagated already, by the state save.
2905 currentLen
= forwardedLength
.elementAti(loc
+1);
2909 case URX_STATE_SAVE
:
2911 // State Save, for forward jumps, propagate the current minimum.
2912 // of the state save.
2913 // For backwards jumps, they create a loop, maximum
2914 // match length is unbounded.
2915 int32_t jmpDest
= URX_VAL(op
);
2916 if (jmpDest
> loc
) {
2917 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
2918 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2921 currentLen
= INT32_MAX
;
2933 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2934 currentLen
+= URX_VAL(stringLenOp
);
2940 case URX_CTR_INIT_NG
:
2942 case URX_CTR_LOOP_NG
:
2944 case URX_LOOP_DOT_I
:
2946 // For anything to do with loops, make the match length unbounded.
2947 // Note: INIT instructions are multi-word. Can ignore because
2948 // INT32_MAX length will stop the per-instruction loop.
2949 currentLen
= INT32_MAX
;
2956 // Look-ahead. Just ignore, treat the look-ahead block as if
2957 // it were normal pattern. Gives a too-long match length,
2958 // but good enough for now.
2961 // End of look-ahead ops should always be consumed by the processing at
2962 // the URX_LA_START op.
2968 // Look-behind. Scan forward until the matching look-around end,
2969 // without processing the look-behind block.
2973 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2974 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2977 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2983 U_ASSERT(loc
< end
);
2993 if (currentLen
== INT32_MAX
) {
2994 // The maximum length is unbounded.
2995 // Stop further processing of the pattern.
3005 //----------------------------------------------------------------------------------------
3007 // stripNOPs Remove any NOP operations from the compiled pattern code.
3008 // Extra NOPs are inserted for some constructs during the initial
3009 // code generation to provide locations that may be patched later.
3010 // Many end up unneeded, and are removed by this function.
3012 //----------------------------------------------------------------------------------------
3013 void RegexCompile::stripNOPs() {
3015 if (U_FAILURE(*fStatus
)) {
3019 int32_t end
= fRXPat
->fCompiledPat
->size();
3020 UVector32
deltas(end
, *fStatus
);
3022 // Make a first pass over the code, computing the amount that things
3023 // will be offset at each location in the original code.
3026 for (loc
=0; loc
<end
; loc
++) {
3027 deltas
.addElement(d
, *fStatus
);
3028 int32_t op
= fRXPat
->fCompiledPat
->elementAti(loc
);
3029 if (URX_TYPE(op
) == URX_NOP
) {
3034 // Make a second pass over the code, removing the NOPs by moving following
3035 // code up, and patching operands that refer to code locations that
3036 // are being moved. The array of offsets from the first step is used
3037 // to compute the new operand values.
3040 for (src
=0; src
<end
; src
++) {
3041 int32_t op
= fRXPat
->fCompiledPat
->elementAti(src
);
3042 int32_t opType
= URX_TYPE(op
);
3047 case URX_STATE_SAVE
:
3050 case URX_CTR_LOOP_NG
:
3051 case URX_RELOC_OPRND
:
3055 // These are instructions with operands that refer to code locations.
3057 int32_t operandAddress
= URX_VAL(op
);
3058 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3059 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3060 op
= URX_BUILD(opType
, fixedOperandAddress
);
3061 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3066 case URX_RESERVED_OP
:
3067 case URX_RESERVED_OP_N
:
3072 case URX_STRING_LEN
:
3073 case URX_START_CAPTURE
:
3074 case URX_END_CAPTURE
:
3075 case URX_STATIC_SETREF
:
3076 case URX_STAT_SETREF_N
:
3080 case URX_BACKSLASH_B
:
3081 case URX_BACKSLASH_G
:
3082 case URX_BACKSLASH_X
:
3083 case URX_BACKSLASH_Z
:
3084 case URX_DOTANY_ALL
:
3085 case URX_DOTANY_ALL_PL
:
3087 case URX_BACKSLASH_D
:
3091 case URX_CTR_INIT_NG
:
3095 case URX_STO_INP_LOC
:
3109 case URX_LOOP_DOT_I
:
3111 // These instructions are unaltered by the relocation.
3112 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3117 // Some op is unaccounted for.
3119 error(U_REGEX_INTERNAL_ERROR
);
3123 fRXPat
->fCompiledPat
->setSize(dst
);
3129 //----------------------------------------------------------------------------------------
3131 // OptDotStar Optimize patterns that end with a '.*' or '.+' to
3132 // just advance the input to the end.
3134 // Transform this compiled sequence
3135 // [DOT_ANY | DOT_ANY_ALL]
3136 // JMP_SAV to previous instruction
3137 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3142 // [DOT_ANY_PL | DOT_ANY_ALL_PL]
3143 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3146 //----------------------------------------------------------------------------------------
3147 void RegexCompile::OptDotStar() {
3148 // Scan backwards in the pattern, looking for a JMP_SAV near the end.
3152 for (jmpLoc
=fRXPat
->fCompiledPat
->size(); jmpLoc
--;) {
3154 op
= fRXPat
->fCompiledPat
->elementAti(jmpLoc
);
3155 opType
= URX_TYPE(op
);
3161 case URX_END_CAPTURE
:
3164 case URX_BACKSLASH_Z
:
3165 // These ops may follow the JMP_SAV without preventing us from
3166 // doing this optimization.
3170 // Got a trailing JMP_SAV that's a candidate for optimization.
3174 // This optimization not possible.
3177 break; // from the for loop.
3180 // We found in URX_JMP_SAV near the end that is a candidate for optimizing.
3181 // Is the target address the previous instruction?
3182 // Is the previous instruction a flavor of URX_DOTANY
3183 int32_t loopTopLoc
= URX_VAL(op
);
3184 if (loopTopLoc
!= jmpLoc
-1) {
3188 int32_t oldOp
= fRXPat
->fCompiledPat
->elementAti(loopTopLoc
);
3189 int32_t oldOpType
= opType
= URX_TYPE(oldOp
);
3190 if (oldOpType
== URX_DOTANY
) {
3191 newOp
= URX_BUILD(URX_DOTANY_PL
, 0);
3193 else if (oldOpType
== URX_DOTANY_ALL
) {
3194 newOp
= URX_BUILD(URX_DOTANY_ALL_PL
, 0);
3196 return; // Sequence we were looking for isn't there.
3199 // Substitute the new instructions into the pattern.
3200 // The NOP will be removed in a later optimization step.
3201 fRXPat
->fCompiledPat
->setElementAt(URX_BUILD(URX_NOP
, 0), loopTopLoc
);
3202 fRXPat
->fCompiledPat
->setElementAt(newOp
, jmpLoc
);
3206 //----------------------------------------------------------------------------------------
3208 // Error Report a rule parse error.
3209 // Only report it if no previous error has been recorded.
3211 //----------------------------------------------------------------------------------------
3212 void RegexCompile::error(UErrorCode e
) {
3213 if (U_SUCCESS(*fStatus
)) {
3215 fParseErr
->line
= fLineNum
;
3216 fParseErr
->offset
= fCharNum
;
3218 // Fill in the context.
3219 // Note: extractBetween() pins supplied indicies to the string bounds.
3220 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3221 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3222 fRXPat
->fPattern
.extractBetween(fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
,
3223 fParseErr
->preContext
, 0);
3224 fRXPat
->fPattern
.extractBetween(fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1,
3225 fParseErr
->postContext
, 0);
3231 // Assorted Unicode character constants.
3232 // Numeric because there is no portable way to enter them as literals.
3235 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3236 static const UChar chLF
= 0x0a;
3237 static const UChar chNEL
= 0x85; // NEL newline variant
3238 static const UChar chLS
= 0x2028; // Unicode Line Separator
3239 static const UChar chApos
= 0x27; // single quote, for quoted chars.
3240 static const UChar chPound
= 0x23; // '#', introduces a comment.
3241 static const UChar chE
= 0x45; // 'E'
3242 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3243 static const UChar chLParen
= 0x28;
3244 static const UChar chRParen
= 0x29;
3245 static const UChar chLBracket
= 0x5b;
3246 static const UChar chRBracket
= 0x5d;
3247 static const UChar chRBrace
= 0x7d;
3248 static const UChar chUpperN
= 0x4E;
3249 static const UChar chLowerP
= 0x70;
3250 static const UChar chUpperP
= 0x50;
3253 //----------------------------------------------------------------------------------------
3255 // nextCharLL Low Level Next Char from the regex pattern.
3256 // Get a char from the string, keep track of input position
3257 // for error reporting.
3259 //----------------------------------------------------------------------------------------
3260 UChar32
RegexCompile::nextCharLL() {
3262 UnicodeString
&pattern
= fRXPat
->fPattern
;
3264 if (fPeekChar
!= -1) {
3269 if (fPatternLength
==0 || fNextIndex
>= fPatternLength
) {
3272 ch
= pattern
.char32At(fNextIndex
);
3273 fNextIndex
= pattern
.moveIndex32(fNextIndex
, 1);
3278 ch
== chLF
&& fLastChar
!= chCR
) {
3279 // Character is starting a new line. Bump up the line number, and
3280 // reset the column to 0.
3284 error(U_REGEX_RULE_SYNTAX
);
3289 // Character is not starting a new line. Except in the case of a
3290 // LF following a CR, increment the column position.
3299 //---------------------------------------------------------------------------------
3301 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3302 // character without actually getting it.
3304 //---------------------------------------------------------------------------------
3305 UChar32
RegexCompile::peekCharLL() {
3306 if (fPeekChar
== -1) {
3307 fPeekChar
= nextCharLL();
3313 //---------------------------------------------------------------------------------
3315 // nextChar for pattern scanning. At this level, we handle stripping
3316 // out comments and processing some backslash character escapes.
3317 // The rest of the pattern grammar is handled at the next level up.
3319 //---------------------------------------------------------------------------------
3320 void RegexCompile::nextChar(RegexPatternChar
&c
) {
3322 fScanIndex
= fNextIndex
;
3323 c
.fChar
= nextCharLL();
3328 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
) || c
.fChar
== (UChar32
)-1) {
3329 fQuoteMode
= FALSE
; // Exit quote mode,
3330 nextCharLL(); // discard the E
3331 nextChar(c
); // recurse to get the real next char
3334 else if (fInBackslashQuote
) {
3335 // The current character immediately follows a '\'
3336 // Don't check for any further escapes, just return it as-is.
3337 // Don't set c.fQuoted, because that would prevent the state machine from
3338 // dispatching on the character.
3339 fInBackslashQuote
= FALSE
;
3343 // We are not in a \Q quoted region \E of the source.
3345 if (fModeFlags
& UREGEX_COMMENTS
) {
3347 // We are in free-spacing and comments mode.
3348 // Scan through any white space and comments, until we
3349 // reach a significant character or the end of inut.
3351 if (c
.fChar
== (UChar32
)-1) {
3352 break; // End of Input
3354 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
3355 // Start of a comment. Consume the rest of it, until EOF or a new line
3357 c
.fChar
= nextCharLL();
3358 if (c
.fChar
== (UChar32
)-1 || // EOF
3367 if (uprv_isRuleWhiteSpace(c
.fChar
) == FALSE
) {
3370 c
.fChar
= nextCharLL();
3375 // check for backslash escaped characters.
3377 int32_t startX
= fNextIndex
; // start and end positions of the
3378 int32_t endX
= fNextIndex
; // sequence following the '\'
3379 if (c
.fChar
== chBackSlash
) {
3380 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
->contains(peekCharLL())) {
3382 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3383 // Includes \uxxxx, \n, \r, many others.
3384 // Return the single equivalent character.
3386 nextCharLL(); // get & discard the peeked char.
3388 c
.fChar
= fRXPat
->fPattern
.unescapeAt(endX
);
3389 if (startX
== endX
) {
3390 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
3392 fCharNum
+= endX
- startX
;
3397 // We are in a '\' escape that will be handled by the state table scanner.
3398 // Just return the backslash, but remember that the following char is to
3399 // be taken literally. TODO: this is awkward, think about alternatives.
3400 fInBackslashQuote
= TRUE
;
3405 // re-enable # to end-of-line comments, in case they were disabled.
3406 // They are disabled by the parser upon seeing '(?', but this lasts for
3407 // the fetching of the next character only.
3408 fEOLComments
= TRUE
;
3410 // putc(c.fChar, stdout);
3415 //---------------------------------------------------------------------------------
3417 // scanSet Construct a UnicodeSet from the text at the current scan
3418 // position. Advance the scan position to the first character
3421 // The scan position is normally under the control of the state machine
3422 // that controls pattern parsing. UnicodeSets, however, are parsed by
3423 // the UnicodeSet constructor, not by the Regex pattern parser.
3425 //---------------------------------------------------------------------------------
3426 UnicodeSet
*RegexCompile::scanSet() {
3427 UnicodeSet
*uset
= NULL
;
3432 if (U_FAILURE(*fStatus
)) {
3436 pos
.setIndex(fScanIndex
);
3437 startPos
= fScanIndex
;
3438 UErrorCode localStatus
= U_ZERO_ERROR
;
3439 uint32_t usetFlags
= 0;
3440 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
3441 usetFlags
|= USET_CASE_INSENSITIVE
;
3443 if (fModeFlags
& UREGEX_COMMENTS
) {
3444 usetFlags
|= USET_IGNORE_SPACE
;
3447 uset
= new UnicodeSet(fRXPat
->fPattern
, pos
,
3448 usetFlags
, localStatus
);
3449 if (U_FAILURE(localStatus
)) {
3450 // TODO: Get more accurate position of the error from UnicodeSet's return info.
3451 // UnicodeSet appears to not be reporting correctly at this time.
3452 REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos
.getIndex());
3458 // Advance the current scan postion over the UnicodeSet.
3459 // Don't just set fScanIndex because the line/char positions maintained
3460 // for error reporting would be thrown off.
3463 if (fNextIndex
>= i
) {
3473 //---------------------------------------------------------------------------------
3475 // scanProp Construct a UnicodeSet from the text at the current scan
3476 // position, which will be of the form \p{whaterver}
3478 // The scan position will be at the 'p' or 'P'. On return
3479 // the scan position should be just after the '}'
3481 // Return a UnicodeSet, constructed from the \P pattern,
3482 // or NULL if the pattern is invalid.
3484 //---------------------------------------------------------------------------------
3485 UnicodeSet
*RegexCompile::scanProp() {
3486 UnicodeSet
*uset
= NULL
;
3488 if (U_FAILURE(*fStatus
)) {
3492 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chUpperP
|| fC
.fChar
== chUpperN
);
3494 // enclose the \p{property} from the regex pattern source in [brackets]
3495 UnicodeString setPattern
;
3496 setPattern
.append(chLBracket
);
3497 setPattern
.append(chBackSlash
);
3499 setPattern
.append(fC
.fChar
);
3500 if (fC
.fChar
== chRBrace
) {
3504 if (fC
.fChar
== -1) {
3505 // Hit the end of the input string without finding the closing '}'
3506 error(U_REGEX_PROPERTY_SYNTAX
);
3510 setPattern
.append(chRBracket
);
3512 uint32_t usetFlags
= 0;
3513 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
3514 usetFlags
|= USET_CASE_INSENSITIVE
;
3516 if (fModeFlags
& UREGEX_COMMENTS
) {
3517 usetFlags
|= USET_IGNORE_SPACE
;
3520 // Build the UnicodeSet from the set pattern we just built up in a string.
3521 uset
= new UnicodeSet(setPattern
, usetFlags
, *fStatus
);
3522 if (U_FAILURE(*fStatus
)) {
3527 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
3532 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS