5 // Copyright (C) 2002-2006 International Business Machines Corporation and others.
6 // All Rights Reserved.
8 // This file contains the ICU regular expression compiler, which is responsible
9 // for processing a regular expression pattern into the compiled form that
10 // is used by the match finding engine.
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
33 #include "regexcst.h" // Contains state table for the regex pattern parser.
34 // generated by a Perl script.
46 //------------------------------------------------------------------------------
50 //------------------------------------------------------------------------------
51 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) : fParenStack(status
)
62 fInBackslashQuote
= FALSE
;
63 fModeFlags
= fRXPat
->fFlags
| 0x80000000;
67 fMatchCloseParen
= -1;
70 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
71 status
= rxp
->fDeferredStatus
;
77 //------------------------------------------------------------------------------
81 //------------------------------------------------------------------------------
82 RegexCompile::~RegexCompile() {
85 //------------------------------------------------------------------------------
87 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
88 // The state tables are hand-written in the file regexcst.txt,
89 // and converted to the form used here by a perl
92 //------------------------------------------------------------------------------
93 void RegexCompile::compile(
94 const UnicodeString
&pat
, // Source pat to be compiled.
95 UParseError
&pp
, // Error position info
96 UErrorCode
&e
) // Error Code
101 fStack
[fStackPtr
] = 0;
103 if (U_FAILURE(*fStatus
)) {
107 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
108 U_ASSERT(fRXPat
->fPattern
.length() == 0);
110 // Prepare the RegexPattern object to receive the compiled pattern.
111 // TODO: remove per-instance field, and just use globals directly. (But check perf)
112 fRXPat
->fPattern
= pat
;
113 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
114 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
117 // Initialize the pattern scanning state machine
118 fPatternLength
= pat
.length();
120 const RegexTableEl
*tableEl
;
121 nextChar(fC
); // Fetch the first char from the pattern string.
124 // Main loop for the regex pattern parsing state machine.
125 // Runs once per state transition.
126 // Each time through optionally performs, depending on the state table,
127 // - an advance to the the next pattern char
128 // - an action to be performed.
129 // - pushing or popping a state to/from the local state return stack.
130 // file regexcst.txt is the source for the state table. The logic behind
131 // recongizing the pattern syntax is there, not here.
134 // Bail out if anything has gone wrong.
135 // Regex pattern parsing stops on the first error encountered.
136 if (U_FAILURE(*fStatus
)) {
140 U_ASSERT(state
!= 0);
142 // Find the state table element that matches the input char from the pattern, or the
143 // class of the input character. Start with the first table row for this
144 // state, then linearly scan forward until we find a row that matches the
145 // character. The last row for each state always matches all characters, so
146 // the search will stop there, if not before.
148 tableEl
= &gRuleParseStateTable
[state
];
149 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
150 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]));
152 for (;;) { // loop through table rows belonging to this state, looking for one
153 // that matches the current input char.
154 REGEX_SCAN_DEBUG_PRINTF(("."));
155 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
156 // Table row specified an individual character, not a set, and
157 // the input character is not quoted, and
158 // the input character matched it.
161 if (tableEl
->fCharClass
== 255) {
162 // Table row specified default, match anything character class.
165 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
166 // Table row specified "quoted" and the char was quoted.
169 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
170 // Table row specified eof and we hit eof on the input.
174 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
175 fC
.fQuoted
== FALSE
&& // char is not escaped &&
176 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
177 UnicodeSet
*uniset
= RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128];
178 if (uniset
->contains(fC
.fChar
)) {
179 // Table row specified a character class, or set of characters,
180 // and the current char matches it.
185 // No match on this row, advance to the next row for this state,
188 REGEX_SCAN_DEBUG_PRINTF(("\n"));
191 // We've found the row of the state table that matches the current input
192 // character from the rules string.
193 // Perform any action specified by this row in the state table.
194 if (doParseActions((EParseAction
)tableEl
->fAction
) == FALSE
) {
195 // Break out of the state machine loop if the
196 // the action signalled some kind of error, or
197 // the action was to exit, occurs on normal end-of-rules-input.
201 if (tableEl
->fPushState
!= 0) {
203 if (fStackPtr
>= kStackSize
) {
204 error(U_REGEX_INTERNAL_ERROR
);
205 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
208 fStack
[fStackPtr
] = tableEl
->fPushState
;
212 // NextChar. This is where characters are actually fetched from the pattern.
213 // Happens under control of the 'n' tag in the state table.
215 if (tableEl
->fNextChar
) {
219 // Get the next state from the table entry, or from the
220 // state stack if the next state was specified as "pop".
221 if (tableEl
->fNextState
!= 255) {
222 state
= tableEl
->fNextState
;
224 state
= fStack
[fStackPtr
];
227 // state stack underflow
228 // This will occur if the user pattern has mis-matched parentheses,
229 // with extra close parens.
232 error(U_REGEX_MISMATCHED_PAREN
);
239 // The pattern has now been read and processed, and the compiled code generated.
242 // Back-reference fixup
245 for (loc
=0; loc
<fRXPat
->fCompiledPat
->size(); loc
++) {
246 int32_t op
= fRXPat
->fCompiledPat
->elementAti(loc
);
247 int32_t opType
= URX_TYPE(op
);
248 if (opType
== URX_BACKREF
|| opType
== URX_BACKREF_I
) {
249 int32_t where
= URX_VAL(op
);
250 if (where
> fRXPat
->fGroupMap
->size()) {
251 error(U_REGEX_INVALID_BACK_REF
);
254 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
255 op
= URX_BUILD(opType
, where
);
256 fRXPat
->fCompiledPat
->setElementAt(op
, loc
);
262 // Compute the number of digits requried for the largest capture group number.
264 fRXPat
->fMaxCaptureDigits
= 1;
267 if (n
> fRXPat
->fGroupMap
->size()) {
270 fRXPat
->fMaxCaptureDigits
++;
275 // The pattern's fFrameSize so far has accumulated the requirements for
276 // storage for capture parentheses, counters, etc. that are encountered
277 // in the pattern. Add space for the two variables that are always
278 // present in the saved state: the input string position and the
279 // position in the compiled pattern.
281 fRXPat
->fFrameSize
+=2;
284 // Get bounds for the minimum and maximum length of a string that this
285 // pattern can match. Used to avoid looking for matches in strings that
288 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
291 // Optimization passes
298 // Set up fast latin-1 range sets
300 int32_t numSets
= fRXPat
->fSets
->size();
301 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
303 for (i
=0; i
<numSets
; i
++) {
304 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
305 fRXPat
->fSets8
[i
].init(s
);
314 //------------------------------------------------------------------------------
316 // doParseAction Do some action during regex pattern parsing.
317 // Called by the parse state machine.
319 // Generation of the match engine PCode happens here, or
320 // in functions called from the parse actions defined here.
323 //------------------------------------------------------------------------------
324 UBool
RegexCompile::doParseActions(EParseAction action
)
326 UBool returnVal
= TRUE
;
328 switch ((Regex_PatternParseAction
)action
) {
331 // Start of pattern compiles to:
332 //0 SAVE 2 Fall back to position of FAIL
334 //2 FAIL Stop if we ever reach here.
335 //3 NOP Dummy, so start of pattern looks the same as
336 // the start of an ( grouping.
337 //4 NOP Resreved, will be replaced by a save if there are
338 // OR | operators at the top level
339 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_STATE_SAVE
, 2), *fStatus
);
340 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_JMP
, 3), *fStatus
);
341 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_FAIL
, 0), *fStatus
);
343 // Standard open nonCapture paren action emits the two NOPs and
344 // sets up the paren stack frame.
345 doParseActions((EParseAction
)doOpenNonCaptureParen
);
349 // We've scanned to the end of the pattern
350 // The end of pattern compiles to:
352 // which will stop the runtime match engine.
353 // Encountering end of pattern also behaves like a close paren,
354 // and forces fixups of the State Save at the beginning of the compiled pattern
355 // and of any OR operations at the top level.
358 if (fParenStack
.size() > 0) {
359 // Missing close paren in pattern.
360 error(U_REGEX_MISMATCHED_PAREN
);
363 // add the END operation to the compiled pattern.
364 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_END
, 0), *fStatus
);
366 // Terminate the pattern compilation state machine.
373 // Scanning a '|', as in (A|B)
375 // Insert a SAVE operation at the start of the pattern section preceding
376 // this OR at this level. This SAVE will branch the match forward
377 // to the right hand side of the OR in the event that the left hand
378 // side fails to match and backtracks. Locate the position for the
379 // save from the location on the top of the parentheses stack.
380 int32_t savePosition
= fParenStack
.popi();
381 int32_t op
= fRXPat
->fCompiledPat
->elementAti(savePosition
);
382 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
383 op
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
384 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
386 // Append an JMP operation into the compiled pattern. The operand for
387 // the JMP will eventually be the location following the ')' for the
388 // group. This will be patched in later, when the ')' is encountered.
389 op
= URX_BUILD(URX_JMP
, 0);
390 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
392 // Push the position of the newly added JMP op onto the parentheses stack.
393 // This registers if for fixup when this block's close paren is encountered.
394 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
396 // Append a NOP to the compiled pattern. This is the slot reserved
397 // for a SAVE in the event that there is yet another '|' following
399 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
400 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
405 case doOpenCaptureParen
:
408 // - NOP, which later may be replaced by a save-state if the
409 // parenthesized group gets a * quantifier, followed by
410 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
411 // - NOP, which may later be replaced by a save-state if there
412 // is an '|' alternation within the parens.
414 // Each capture group gets three slots in the save stack frame:
415 // 0: Capture Group start position (in input string being matched.)
416 // 1: Capture Group end positino.
417 // 2: Start of Match-in-progress.
418 // The first two locations are for a completed capture group, and are
419 // referred to by back references and the like.
420 // The third location stores the capture start position when an START_CAPTURE is
421 // encountered. This will be promoted to a completed capture when (and if) the corresponding
422 // END_CAPure is encountered.
424 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
425 int32_t varsLoc
= fRXPat
->fFrameSize
; // Reserve three slots in match stack frame.
426 fRXPat
->fFrameSize
+= 3;
427 int32_t cop
= URX_BUILD(URX_START_CAPTURE
, varsLoc
);
428 fRXPat
->fCompiledPat
->addElement(cop
, *fStatus
);
429 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
431 // On the Parentheses stack, start a new frame and add the postions
432 // of the two NOPs. Depending on what follows in the pattern, the
433 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
434 // address of the end of the parenthesized group.
435 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
436 fParenStack
.push(capturing
, *fStatus
); // Frame type.
437 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
438 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
440 // Save the mapping from group number to stack frame variable position.
441 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
445 case doOpenNonCaptureParen
:
446 // Open non-caputuring (grouping only) Paren.
448 // - NOP, which later may be replaced by a save-state if the
449 // parenthesized group gets a * quantifier, followed by
450 // - NOP, which may later be replaced by a save-state if there
451 // is an '|' alternation within the parens.
453 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
454 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
456 // On the Parentheses stack, start a new frame and add the postions
458 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
459 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
460 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
461 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
466 case doOpenAtomicParen
:
467 // Open Atomic Paren. (?>
469 // - NOP, which later may be replaced if the parenthesized group
470 // has a quantifier, followed by
471 // - STO_SP save state stack position, so it can be restored at the ")"
472 // - NOP, which may later be replaced by a save-state if there
473 // is an '|' alternation within the parens.
475 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
476 int32_t varLoc
= fRXPat
->fDataSize
; // Reserve a data location for saving the
477 fRXPat
->fDataSize
+= 1; // state stack ptr.
478 int32_t stoOp
= URX_BUILD(URX_STO_SP
, varLoc
);
479 fRXPat
->fCompiledPat
->addElement(stoOp
, *fStatus
);
480 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
482 // On the Parentheses stack, start a new frame and add the postions
483 // of the two NOPs. Depending on what follows in the pattern, the
484 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
485 // address of the end of the parenthesized group.
486 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
487 fParenStack
.push(atomic
, *fStatus
); // Frame type.
488 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
489 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
494 case doOpenLookAhead
:
495 // Positive Look-ahead (?= stuff )
497 // 1 START_LA dataLoc
498 // 2. NOP reserved for use by quantifiers on the block.
499 // Look-ahead can't have quantifiers, but paren stack
500 // compile time conventions require the slot anyhow.
501 // 3. NOP may be replaced if there is are '|' ops in the block.
502 // 4. code for parenthesized stuff.
505 // Two data slots are reserved, for saving the stack ptr and the input position.
507 int32_t dataLoc
= fRXPat
->fDataSize
;
508 fRXPat
->fDataSize
+= 2;
509 int32_t op
= URX_BUILD(URX_LA_START
, dataLoc
);
510 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
512 op
= URX_BUILD(URX_NOP
, 0);
513 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
514 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
516 // On the Parentheses stack, start a new frame and add the postions
518 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
519 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
520 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
521 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
525 case doOpenLookAheadNeg
:
526 // Negated Lookahead. (?! stuff )
528 // 1. START_LA dataloc
529 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
530 // // which continues with the match.
531 // 3. NOP // Std. Open Paren sequence, for possible '|'
532 // 4. code for parenthesized stuff.
533 // 5. END_LA // Cut back stack, remove saved state from step 2.
534 // 6. FAIL // code in block succeeded, so neg. lookahead fails.
537 int32_t dataLoc
= fRXPat
->fDataSize
;
538 fRXPat
->fDataSize
+= 2;
539 int32_t op
= URX_BUILD(URX_LA_START
, dataLoc
);
540 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
542 op
= URX_BUILD(URX_STATE_SAVE
, 0); // dest address will be patched later.
543 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
545 op
= URX_BUILD(URX_NOP
, 0);
546 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
548 // On the Parentheses stack, start a new frame and add the postions
549 // of the StateSave and NOP.
550 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
551 fParenStack
.push( negLookAhead
, *fStatus
); // Frame type
552 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
553 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
555 // Instructions #5 and #6 will be added when the ')' is encountered.
559 case doOpenLookBehind
:
561 // Compile a (?<= look-behind open paren.
564 // 0 URX_LB_START dataLoc
565 // 1 URX_LB_CONT dataLoc
568 // 4 URX_NOP Standard '(' boilerplate.
569 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
570 // 6 <code for LookBehind expression>
571 // 7 URX_LB_END dataLoc # Check match len, restore input len
572 // 8 URX_LA_END dataLoc # Restore stack, input pos
574 // Allocate a block of matcher data, to contain (when running a match)
575 // 0: Stack ptr on entry
576 // 1: Input Index on entry
577 // 2: Start index of match current match attempt.
578 // 3: Original Input String len.
580 // Allocate data space
581 int32_t dataLoc
= fRXPat
->fDataSize
;
582 fRXPat
->fDataSize
+= 4;
585 int32_t op
= URX_BUILD(URX_LB_START
, dataLoc
);
586 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
589 op
= URX_BUILD(URX_LB_CONT
, dataLoc
);
590 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
591 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MinMatchLength. To be filled later.
592 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MaxMatchLength. To be filled later.
595 op
= URX_BUILD(URX_NOP
, 0);
596 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
597 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
599 // On the Parentheses stack, start a new frame and add the postions
600 // of the URX_LB_CONT and the NOP.
601 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
602 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
603 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
604 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
606 // The final two instructions will be added when the ')' is encountered.
611 case doOpenLookBehindNeg
:
613 // Compile a (?<! negated look-behind open paren.
616 // 0 URX_LB_START dataLoc # Save entry stack, input len
617 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
621 // 5 URX_NOP Standard '(' boilerplate.
622 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
623 // 7 <code for LookBehind expression>
624 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
627 // Allocate a block of matcher data, to contain (when running a match)
628 // 0: Stack ptr on entry
629 // 1: Input Index on entry
630 // 2: Start index of match current match attempt.
631 // 3: Original Input String len.
633 // Allocate data space
634 int32_t dataLoc
= fRXPat
->fDataSize
;
635 fRXPat
->fDataSize
+= 4;
638 int32_t op
= URX_BUILD(URX_LB_START
, dataLoc
);
639 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
642 op
= URX_BUILD(URX_LBN_CONT
, dataLoc
);
643 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
644 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MinMatchLength. To be filled later.
645 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MaxMatchLength. To be filled later.
646 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // Continue Loc. To be filled later.
649 op
= URX_BUILD(URX_NOP
, 0);
650 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
651 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
653 // On the Parentheses stack, start a new frame and add the postions
654 // of the URX_LB_CONT and the NOP.
655 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
656 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
657 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
658 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
660 // The final two instructions will be added when the ')' is encountered.
664 case doConditionalExpr
:
665 // Conditionals such as (?(1)a:b)
667 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
668 error(U_REGEX_UNIMPLEMENTED
);
674 if (fParenStack
.size() <= 0) {
675 // Extra close paren, or missing open paren.
676 error(U_REGEX_MISMATCHED_PAREN
);
684 case doBadOpenParenType
:
686 error(U_REGEX_RULE_SYNTAX
);
690 case doMismatchedParenErr
:
691 error(U_REGEX_MISMATCHED_PAREN
);
695 // Normal '+' compiles to
696 // 1. stuff to be repeated (already built)
700 // Or, if the item to be repeated can match a zero length string,
701 // 1. STO_INP_LOC data-loc
702 // 2. body of stuff to be repeated
707 // Or, if the item to be repeated is simple
708 // 1. Item to be repeated.
709 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
710 // 3. LOOP_C stack location
712 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
715 // Check for simple constructs, which may get special optimized code.
716 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
717 int32_t repeatedOp
= fRXPat
->fCompiledPat
->elementAti(topLoc
);
719 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
720 // Emit optimized code for [char set]+
721 int32_t loopOpI
= URX_BUILD(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
722 fRXPat
->fCompiledPat
->addElement(loopOpI
, *fStatus
);
723 frameLoc
= fRXPat
->fFrameSize
;
724 fRXPat
->fFrameSize
++;
725 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, frameLoc
);
726 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
730 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
731 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
732 // Emit Optimized code for .+ operations.
733 int32_t loopOpI
= URX_BUILD(URX_LOOP_DOT_I
, 0);
734 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
735 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
738 fRXPat
->fCompiledPat
->addElement(loopOpI
, *fStatus
);
739 frameLoc
= fRXPat
->fFrameSize
;
740 fRXPat
->fFrameSize
++;
741 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, frameLoc
);
742 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
750 // Check for minimum match length of zero, which requires
751 // extra loop-breaking code.
752 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
753 // Zero length match is possible.
754 // Emit the code sequence that can handle it.
756 frameLoc
= fRXPat
->fFrameSize
;
757 fRXPat
->fFrameSize
++;
759 int32_t op
= URX_BUILD(URX_STO_INP_LOC
, frameLoc
);
760 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
762 op
= URX_BUILD(URX_JMP_SAV_X
, topLoc
+1);
763 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
765 // Simpler code when the repeated body must match something non-empty
766 int32_t jmpOp
= URX_BUILD(URX_JMP_SAV
, topLoc
);
767 fRXPat
->fCompiledPat
->addElement(jmpOp
, *fStatus
);
773 // Non-greedy '+?' compiles to
774 // 1. stuff to be repeated (already built)
778 int32_t topLoc
= blockTopLoc(FALSE
);
779 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, topLoc
);
780 fRXPat
->fCompiledPat
->addElement(saveStateOp
, *fStatus
);
786 // Normal (greedy) ? quantifier.
789 // 2. body of optional block
791 // Insert the state save into the compiled pattern, and we're done.
793 int32_t saveStateLoc
= blockTopLoc(TRUE
);
794 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
795 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
800 // Non-greedy ?? quantifier
803 // 2. body of optional block
807 // This code is less than ideal, with two jmps instead of one, because we can only
808 // insert one instruction at the top of the block being iterated.
810 int32_t jmp1_loc
= blockTopLoc(TRUE
);
811 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
813 int32_t jmp1_op
= URX_BUILD(URX_JMP
, jmp2_loc
+1);
814 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
816 int32_t jmp2_op
= URX_BUILD(URX_JMP
, jmp2_loc
+2);
817 fRXPat
->fCompiledPat
->addElement(jmp2_op
, *fStatus
);
819 int32_t save_op
= URX_BUILD(URX_STATE_SAVE
, jmp1_loc
+1);
820 fRXPat
->fCompiledPat
->addElement(save_op
, *fStatus
);
826 // Normal (greedy) * quantifier.
829 // 2. body of stuff being iterated over
833 // Or, if the body is a simple [Set],
834 // 1. LOOP_SR_I set number
835 // 2. LOOP_C stack location
838 // Or if this is a .*
839 // 1. LOOP_DOT_I (. matches all mode flag)
840 // 2. LOOP_C stack location
842 // Or, if the body can match a zero-length string, to inhibit infinite loops,
844 // 2. STO_INP_LOC data-loc
849 // location of item #1, the STATE_SAVE
850 int32_t topLoc
= blockTopLoc(FALSE
);
851 int32_t dataLoc
= -1;
853 // Check for simple *, where the construct being repeated
854 // compiled to single opcode, and might be optimizable.
855 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
856 int32_t repeatedOp
= fRXPat
->fCompiledPat
->elementAti(topLoc
);
858 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
859 // Emit optimized code for a [char set]*
860 int32_t loopOpI
= URX_BUILD(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
861 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
862 dataLoc
= fRXPat
->fFrameSize
;
863 fRXPat
->fFrameSize
++;
864 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, dataLoc
);
865 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
869 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
870 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
871 // Emit Optimized code for .* operations.
872 int32_t loopOpI
= URX_BUILD(URX_LOOP_DOT_I
, 0);
873 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
874 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
877 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
878 dataLoc
= fRXPat
->fFrameSize
;
879 fRXPat
->fFrameSize
++;
880 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, dataLoc
);
881 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
886 // Emit general case code for this *
887 // The optimizations did not apply.
889 int32_t saveStateLoc
= blockTopLoc(TRUE
);
890 int32_t jmpOp
= URX_BUILD(URX_JMP_SAV
, saveStateLoc
+1);
892 // Check for minimum match length of zero, which requires
893 // extra loop-breaking code.
894 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
895 insertOp(saveStateLoc
);
896 dataLoc
= fRXPat
->fFrameSize
;
897 fRXPat
->fFrameSize
++;
899 int32_t op
= URX_BUILD(URX_STO_INP_LOC
, dataLoc
);
900 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
901 jmpOp
= URX_BUILD(URX_JMP_SAV_X
, saveStateLoc
+2);
904 // Locate the position in the compiled pattern where the match will continue
905 // after completing the *. (4 or 5 in the comment above)
906 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
908 // Put together the save state op store it into the compiled code.
909 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, continueLoc
);
910 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
912 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
913 fRXPat
->fCompiledPat
->addElement(jmpOp
, *fStatus
);
918 // Non-greedy *? quantifier
921 // 2. body of stuff being iterated over
925 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
926 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
927 int32_t jmpOp
= URX_BUILD(URX_JMP
, saveLoc
);
928 int32_t stateSaveOp
= URX_BUILD(URX_STATE_SAVE
, jmpLoc
+1);
929 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
930 fRXPat
->fCompiledPat
->addElement(stateSaveOp
, *fStatus
);
936 // The '{' opening an interval quantifier was just scanned.
937 // Init the counter varaiables that will accumulate the values as the digits
943 case doIntevalLowerDigit
:
944 // Scanned a digit from the lower value of an {lower,upper} interval
946 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
947 U_ASSERT(digitValue
>= 0);
948 fIntervalLow
= fIntervalLow
*10 + digitValue
;
949 if (fIntervalLow
< 0) {
950 error(U_REGEX_NUMBER_TOO_BIG
);
955 case doIntervalUpperDigit
:
956 // Scanned a digit from the upper value of an {lower,upper} interval
958 if (fIntervalUpper
< 0) {
961 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
962 U_ASSERT(digitValue
>= 0);
963 fIntervalUpper
= fIntervalUpper
*10 + digitValue
;
964 if (fIntervalUpper
< 0) {
965 error(U_REGEX_NUMBER_TOO_BIG
);
971 // Scanned a single value interval like {27}. Upper = Lower.
972 fIntervalUpper
= fIntervalLow
;
976 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
977 if (compileInlineInterval() == FALSE
) {
978 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
982 case doPossessiveInterval
:
983 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
985 // Remember the loc for the top of the block being looped over.
986 // (Can not reserve a slot in the compiled pattern at this time, becuase
987 // compileInterval needs to reserve also, and blockTopLoc can only reserve
989 int32_t topLoc
= blockTopLoc(FALSE
);
991 // Produce normal looping code.
992 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
994 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
995 // just as if the loop was inclosed in atomic parentheses.
997 // First the STO_SP before the start of the loop
999 int32_t varLoc
= fRXPat
->fDataSize
; // Reserve a data location for saving the
1000 fRXPat
->fDataSize
+= 1; // state stack ptr.
1001 int32_t op
= URX_BUILD(URX_STO_SP
, varLoc
);
1002 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1004 int32_t loopOp
= fRXPat
->fCompiledPat
->popi();
1005 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1006 loopOp
++; // point LoopOp after the just-inserted STO_SP
1007 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1009 // Then the LD_SP after the end of the loop
1010 op
= URX_BUILD(URX_LD_SP
, varLoc
);
1011 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1017 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1018 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1021 case doIntervalError
:
1022 error(U_REGEX_BAD_INTERVAL
);
1026 // We've just scanned a "normal" character from the pattern,
1027 literalChar(fC
.fChar
);
1033 // scanned a ".", match any single character.
1036 if (fModeFlags
& UREGEX_DOTALL
) {
1037 op
= URX_BUILD(URX_DOTANY_ALL
, 0);
1039 op
= URX_BUILD(URX_DOTANY
, 0);
1041 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1047 int32_t op
= (fModeFlags
& UREGEX_MULTILINE
)? URX_CARET_M
: URX_CARET
;
1048 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1055 int32_t op
= (fModeFlags
& UREGEX_MULTILINE
)? URX_DOLLAR_M
: URX_DOLLAR
;
1056 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1061 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_CARET
, 0), *fStatus
);
1066 #if UCONFIG_NO_BREAK_ITERATION==1
1067 if (fModeFlags
& UREGEX_UWORD
) {
1068 error(U_UNSUPPORTED_ERROR
);
1071 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1072 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 1), *fStatus
);
1078 #if UCONFIG_NO_BREAK_ITERATION==1
1079 if (fModeFlags
& UREGEX_UWORD
) {
1080 error(U_UNSUPPORTED_ERROR
);
1083 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1084 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1089 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_D
, 1), *fStatus
);
1093 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_D
, 0), *fStatus
);
1097 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_G
, 0), *fStatus
);
1101 fRXPat
->fCompiledPat
->addElement(
1102 URX_BUILD(URX_STAT_SETREF_N
, URX_ISSPACE_SET
), *fStatus
);
1106 fRXPat
->fCompiledPat
->addElement(
1107 URX_BUILD(URX_STATIC_SETREF
, URX_ISSPACE_SET
), *fStatus
);
1111 fRXPat
->fCompiledPat
->addElement(
1112 URX_BUILD(URX_STAT_SETREF_N
, URX_ISWORD_SET
), *fStatus
);
1116 fRXPat
->fCompiledPat
->addElement(
1117 URX_BUILD(URX_STATIC_SETREF
, URX_ISWORD_SET
), *fStatus
);
1121 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_X
, 0), *fStatus
);
1126 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_DOLLAR
, 0), *fStatus
);
1130 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_Z
, 0), *fStatus
);
1134 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1143 UnicodeSet
*theSet
= scanProp();
1149 case doScanUnicodeSet
:
1151 UnicodeSet
*theSet
= scanSet();
1156 case doEnterQuoteMode
:
1157 // Just scanned a \Q. Put character scanner into quote mode.
1162 // BackReference. Somewhat unusual in that the front-end can not completely parse
1163 // the regular expression, because the number of digits to be consumed
1164 // depends on the number of capture groups that have been defined. So
1165 // we have to do it here instead.
1167 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1168 int32_t groupNum
= 0;
1169 UChar32 c
= fC
.fChar
;
1172 // Loop once per digit, for max allowed number of digits in a back reference.
1173 int32_t digit
= u_charDigitValue(c
);
1174 groupNum
= groupNum
* 10 + digit
;
1175 if (groupNum
>= numCaptureGroups
) {
1179 if (RegexStaticSets::gStaticSets
->fRuleDigits
->contains(c
) == FALSE
) {
1185 // Scan of the back reference in the source regexp is complete. Now generate
1186 // the compiled code for it.
1187 // Because capture groups can be forward-referenced by back-references,
1188 // we fill the operand with the capture group number. At the end
1189 // of compilation, it will be changed to the variable's location.
1190 U_ASSERT(groupNum
> 0);
1192 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1193 op
= URX_BUILD(URX_BACKREF_I
, groupNum
);
1195 op
= URX_BUILD(URX_BACKREF
, groupNum
);
1197 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1202 case doPossessivePlus
:
1203 // Possessive ++ quantifier.
1206 // 2. body of stuff being iterated over
1212 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1213 // then unconditionally discarded. Perhaps introduce a new opcode
1217 int32_t topLoc
= blockTopLoc(TRUE
);
1218 int32_t stoLoc
= fRXPat
->fDataSize
;
1219 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1220 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1221 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1223 // Emit the STATE_SAVE
1224 op
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1225 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1228 op
= URX_BUILD(URX_JMP
, topLoc
+1);
1229 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1232 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1233 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1237 case doPossessiveStar
:
1238 // Possessive *+ quantifier.
1242 // 3. body of stuff being iterated over
1246 // TODO: do something to cut back the state stack each time through the loop.
1248 // Reserve two slots at the top of the block.
1249 int32_t topLoc
= blockTopLoc(TRUE
);
1253 int32_t stoLoc
= fRXPat
->fDataSize
;
1254 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1255 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1256 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1258 // Emit the SAVE_STATE 5
1259 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1260 op
= URX_BUILD(URX_STATE_SAVE
, L7
);
1261 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1263 // Append the JMP operation.
1264 op
= URX_BUILD(URX_JMP
, topLoc
+1);
1265 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1267 // Emit the LD_SP loc
1268 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1269 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1273 case doPossessiveOpt
:
1274 // Possessive ?+ quantifier.
1278 // 3. body of optional block
1283 // Reserve two slots at the top of the block.
1284 int32_t topLoc
= blockTopLoc(TRUE
);
1288 int32_t stoLoc
= fRXPat
->fDataSize
;
1289 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1290 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1291 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1293 // Emit the SAVE_STATE
1294 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1295 op
= URX_BUILD(URX_STATE_SAVE
, continueLoc
);
1296 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1299 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1300 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1305 case doBeginMatchMode
:
1306 fNewModeFlags
= fModeFlags
;
1307 fSetModeFlag
= TRUE
;
1310 case doMatchMode
: // (?i) and similar
1314 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1315 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1316 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1317 case 0x77: /* 'w' */ bit
= UREGEX_UWORD
; break;
1318 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1319 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1321 U_ASSERT(FALSE
); // Should never happen. Other chars are filtered out
1325 fNewModeFlags
|= bit
;
1327 fNewModeFlags
&= ~bit
;
1332 case doSetMatchMode
:
1333 // We've got a (?i) or similar. The match mode is being changed, but
1334 // the change is not scoped to a parenthesized block.
1335 U_ASSERT(fNewModeFlags
< 0);
1336 fModeFlags
= fNewModeFlags
;
1338 // Prevent any string from spanning across the change of match mode.
1339 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
1344 case doMatchModeParen
:
1345 // We've got a (?i: or similar. Begin a parenthesized block, save old
1346 // mode flags so they can be restored at the close of the block.
1349 // - NOP, which later may be replaced by a save-state if the
1350 // parenthesized group gets a * quantifier, followed by
1351 // - NOP, which may later be replaced by a save-state if there
1352 // is an '|' alternation within the parens.
1354 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
1355 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
1357 // On the Parentheses stack, start a new frame and add the postions
1358 // of the two NOPs (a normal non-capturing () frame, except for the
1359 // saving of the orignal mode flags.)
1360 fParenStack
.push(fModeFlags
, *fStatus
);
1361 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1362 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1363 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1365 // Set the current mode flags to the new values.
1366 U_ASSERT(fNewModeFlags
< 0);
1367 fModeFlags
= fNewModeFlags
;
1372 error(U_REGEX_INVALID_FLAG
);
1375 case doSuppressComments
:
1376 // We have just scanned a '(?'. We now need to prevent the character scanner from
1377 // treating a '#' as a to-the-end-of-line comment.
1378 // (This Perl compatibility just gets uglier and uglier to do...)
1379 fEOLComments
= FALSE
;
1386 error(U_REGEX_INTERNAL_ERROR
);
1390 if (U_FAILURE(*fStatus
)) {
1399 //------------------------------------------------------------------------------
1401 // literalChar We've encountered a literal character from the pattern,
1402 // or an escape sequence that reduces to a character.
1403 // Add it to the string containing all literal chars/strings from
1405 // If we are in a pattern string already, add the new char to it.
1406 // If we aren't in a pattern string, begin one now.
1408 //------------------------------------------------------------------------------
1409 void RegexCompile::literalChar(UChar32 c
) {
1410 int32_t op
; // An operation in the compiled pattern.
1412 int32_t patternLoc
; // A position in the compiled pattern.
1416 // If the last thing compiled into the pattern was not a literal char,
1417 // force this new literal char to begin a new string, and not append to the previous.
1418 op
= fRXPat
->fCompiledPat
->lastElementi();
1419 opType
= URX_TYPE(op
);
1420 if (!(opType
== URX_STRING_LEN
|| opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
)) {
1424 if (fStringOpStart
== -1) {
1425 // First char of a string in the pattern.
1426 // Emit a OneChar op into the compiled pattern.
1429 // Also add it to the string pool, in case we get a second adjacent literal
1430 // and want to change form ONE_CHAR to STRING
1431 fStringOpStart
= fRXPat
->fLiteralText
.length();
1432 fRXPat
->fLiteralText
.append(c
);
1436 // We are adding onto an existing string
1437 fRXPat
->fLiteralText
.append(c
);
1439 op
= fRXPat
->fCompiledPat
->lastElementi();
1440 opType
= URX_TYPE(op
);
1441 U_ASSERT(opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
|| opType
== URX_STRING_LEN
);
1443 // If the most recently emitted op is a URX_ONECHAR,
1444 if (opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
) {
1445 if (U16_IS_TRAIL(c
) && U16_IS_LEAD(URX_VAL(op
))) {
1446 // The most recently emitted op is a ONECHAR that was the first half
1447 // of a surrogate pair. Update the ONECHAR's operand to be the
1448 // supplementary code point resulting from both halves of the pair.
1449 c
= U16_GET_SUPPLEMENTARY(URX_VAL(op
), c
);
1450 op
= URX_BUILD(opType
, c
);
1451 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1452 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1456 // The most recently emitted op is a ONECHAR.
1457 // We've now received another adjacent char. Change the ONECHAR op
1459 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1460 op
= URX_BUILD(URX_STRING_I
, fStringOpStart
);
1462 op
= URX_BUILD(URX_STRING
, fStringOpStart
);
1464 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1465 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1466 op
= URX_BUILD(URX_STRING_LEN
, 0);
1467 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1470 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the
1471 // string length to reflect the new char we just added to the string.
1472 stringLen
= fRXPat
->fLiteralText
.length() - fStringOpStart
;
1473 op
= URX_BUILD(URX_STRING_LEN
, stringLen
);
1474 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1475 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1480 //------------------------------------------------------------------------------
1482 // emitONE_CHAR emit a ONE_CHAR op into the generated code.
1483 // Choose cased or uncased version, depending on the
1484 // match mode and whether the character itself is cased.
1486 //------------------------------------------------------------------------------
1487 void RegexCompile::emitONE_CHAR(UChar32 c
) {
1489 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1490 u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
1491 // We have a cased character, and are in case insensitive matching mode.
1492 c
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
1493 op
= URX_BUILD(URX_ONECHAR_I
, c
);
1495 // Uncased char, or case sensitive match mode.
1496 // Either way, just generate a literal compare of the char.
1497 op
= URX_BUILD(URX_ONECHAR
, c
);
1499 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1503 //------------------------------------------------------------------------------
1505 // fixLiterals When compiling something that can follow a literal
1506 // string in a pattern, we need to "fix" any preceding
1507 // string, which will cause any subsequent literals to
1508 // begin a new string, rather than appending to the
1511 // Optionally, split the last char of the string off into
1512 // a single "ONE_CHAR" operation, so that quantifiers can
1513 // apply to that char alone. Example: abc*
1514 // The * must apply to the 'c' only.
1516 //------------------------------------------------------------------------------
1517 void RegexCompile::fixLiterals(UBool split
) {
1518 int32_t stringStart
= fStringOpStart
; // start index of the current literal string
1519 int32_t op
; // An op from/for the compiled pattern.
1520 int32_t opType
; // An opcode type from the compiled pattern.
1521 int32_t stringLastCharIdx
;
1523 int32_t stringNextToLastCharIdx
;
1524 UChar32 nextToLastChar
;
1527 fStringOpStart
= -1;
1532 // Split: We need to ensure that the last item in the compiled pattern does
1533 // not refer to a literal string of more than one char. If it does,
1534 // separate the last char from the rest of the string.
1536 // If the last operation from the compiled pattern is not a string,
1537 // nothing needs to be done
1538 op
= fRXPat
->fCompiledPat
->lastElementi();
1539 opType
= URX_TYPE(op
);
1540 if (opType
!= URX_STRING_LEN
) {
1543 stringLen
= URX_VAL(op
);
1546 // Find the position of the last code point in the string (might be a surrogate pair)
1548 stringLastCharIdx
= fRXPat
->fLiteralText
.length();
1549 stringLastCharIdx
= fRXPat
->fLiteralText
.moveIndex32(stringLastCharIdx
, -1);
1550 lastChar
= fRXPat
->fLiteralText
.char32At(stringLastCharIdx
);
1552 // The string should always be at least two code points long, meaning that there
1553 // should be something before the last char position that we just found.
1554 U_ASSERT(stringLastCharIdx
> stringStart
);
1555 stringNextToLastCharIdx
= fRXPat
->fLiteralText
.moveIndex32(stringLastCharIdx
, -1);
1556 U_ASSERT(stringNextToLastCharIdx
>= stringStart
);
1557 nextToLastChar
= fRXPat
->fLiteralText
.char32At(stringNextToLastCharIdx
);
1559 if (stringNextToLastCharIdx
> stringStart
) {
1560 // The length of string remaining after removing one char is two or more.
1561 // Leave the string in the compiled pattern, shorten it by one char,
1562 // and append a URX_ONECHAR op for the last char.
1563 stringLen
-= (fRXPat
->fLiteralText
.length() - stringLastCharIdx
);
1564 op
= URX_BUILD(URX_STRING_LEN
, stringLen
);
1565 fRXPat
->fCompiledPat
->setElementAt(op
, fRXPat
->fCompiledPat
->size() -1);
1566 emitONE_CHAR(lastChar
);
1568 // The original string consisted of exactly two characters. Replace
1569 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
1571 fRXPat
->fCompiledPat
->setSize(fRXPat
->fCompiledPat
->size() -2);
1572 emitONE_CHAR(nextToLastChar
);
1573 emitONE_CHAR(lastChar
);
1582 //------------------------------------------------------------------------------
1584 // insertOp() Insert a slot for a new opcode into the already
1585 // compiled pattern code.
1587 // Fill the slot with a NOP. Our caller will replace it
1588 // with what they really wanted.
1590 //------------------------------------------------------------------------------
1591 void RegexCompile::insertOp(int32_t where
) {
1592 UVector32
*code
= fRXPat
->fCompiledPat
;
1593 U_ASSERT(where
>0 && where
< code
->size());
1595 int32_t nop
= URX_BUILD(URX_NOP
, 0);
1596 code
->insertElementAt(nop
, where
, *fStatus
);
1598 // Walk through the pattern, looking for any ops with targets that
1599 // were moved down by the insert. Fix them.
1601 for (loc
=0; loc
<code
->size(); loc
++) {
1602 int32_t op
= code
->elementAti(loc
);
1603 int32_t opType
= URX_TYPE(op
);
1604 int32_t opValue
= URX_VAL(op
);
1605 if ((opType
== URX_JMP
||
1606 opType
== URX_JMPX
||
1607 opType
== URX_STATE_SAVE
||
1608 opType
== URX_CTR_LOOP
||
1609 opType
== URX_CTR_LOOP_NG
||
1610 opType
== URX_JMP_SAV
||
1611 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
1612 // Target location for this opcode is after the insertion point and
1613 // needs to be incremented to adjust for the insertion.
1615 op
= URX_BUILD(opType
, opValue
);
1616 code
->setElementAt(op
, loc
);
1620 // Now fix up the parentheses stack. All positive values in it are locations in
1621 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
1622 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
1623 int32_t x
= fParenStack
.elementAti(loc
);
1624 U_ASSERT(x
< code
->size());
1627 fParenStack
.setElementAt(x
, loc
);
1631 if (fMatchCloseParen
> where
) {
1634 if (fMatchOpenParen
> where
) {
1641 //------------------------------------------------------------------------------
1643 // blockTopLoc() Find or create a location in the compiled pattern
1644 // at the start of the operation or block that has
1645 // just been compiled. Needed when a quantifier (* or
1646 // whatever) appears, and we need to add an operation
1647 // at the start of the thing being quantified.
1649 // (Parenthesized Blocks) have a slot with a NOP that
1650 // is reserved for this purpose. .* or similar don't
1651 // and a slot needs to be added.
1653 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
1654 // at the returned location.
1655 // FALSE - just return the address,
1656 // do not reserve a location there.
1658 //------------------------------------------------------------------------------
1659 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
1661 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
1663 // The item just processed is a parenthesized block.
1664 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
1665 U_ASSERT(theLoc
> 0);
1666 U_ASSERT(URX_TYPE(((uint32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
))) == URX_NOP
);
1669 // Item just compiled is a single thing, a ".", or a single char, or a set reference.
1670 // No slot for STATE_SAVE was pre-reserved in the compiled code.
1671 // We need to make space now.
1672 fixLiterals(TRUE
); // If last item was a string, separate the last char.
1673 theLoc
= fRXPat
->fCompiledPat
->size()-1;
1675 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/
1676 int32_t nop
= URX_BUILD(URX_NOP
, 0);
1677 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
1685 //------------------------------------------------------------------------------
1687 // handleCloseParen When compiling a close paren, we need to go back
1688 // and fix up any JMP or SAVE operations within the
1689 // parenthesized block that need to target the end
1690 // of the block. The locations of these are kept on
1691 // the paretheses stack.
1693 // This function is called both when encountering a
1694 // real ) and at the end of the pattern.
1696 //------------------------------------------------------------------------------
1697 void RegexCompile::handleCloseParen() {
1700 if (fParenStack
.size() <= 0) {
1701 error(U_REGEX_MISMATCHED_PAREN
);
1705 // Force any literal chars that may follow the close paren to start a new string,
1706 // and not attach to any preceding it.
1709 // Fixup any operations within the just-closed parenthesized group
1710 // that need to reference the end of the (block).
1711 // (The first one popped from the stack is an unused slot for
1712 // alternation (OR) state save, but applying the fixup to it does no harm.)
1714 patIdx
= fParenStack
.popi();
1716 // value < 0 flags the start of the frame on the paren stack.
1719 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
1720 patOp
= fRXPat
->fCompiledPat
->elementAti(patIdx
);
1721 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
1722 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
1723 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
1724 fMatchOpenParen
= patIdx
;
1727 // At the close of any parenthesized block, restore the match mode flags to
1728 // the value they had at the open paren. Saved value is
1729 // at the top of the paren stack.
1730 fModeFlags
= fParenStack
.popi();
1731 U_ASSERT(fModeFlags
< 0);
1733 // DO any additional fixups, depending on the specific kind of
1734 // parentesized grouping this is
1739 // No additional fixups required.
1740 // (Grouping-only parentheses)
1743 // Capturing Parentheses.
1744 // Insert a End Capture op into the pattern.
1745 // The frame offset of the variables for this cg is obtained from the
1746 // start capture op and put it into the end-capture op.
1748 int32_t captureOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
1749 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
1751 int32_t frameVarLocation
= URX_VAL(captureOp
);
1752 int32_t endCaptureOp
= URX_BUILD(URX_END_CAPTURE
, frameVarLocation
);
1753 fRXPat
->fCompiledPat
->addElement(endCaptureOp
, *fStatus
);
1757 // Atomic Parenthesis.
1758 // Insert a LD_SP operation to restore the state stack to the position
1759 // it was when the atomic parens were entered.
1761 int32_t stoOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
1762 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
1763 int32_t stoLoc
= URX_VAL(stoOp
);
1764 int32_t ldOp
= URX_BUILD(URX_LD_SP
, stoLoc
);
1765 fRXPat
->fCompiledPat
->addElement(ldOp
, *fStatus
);
1771 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
1772 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
1773 int32_t dataLoc
= URX_VAL(startOp
);
1774 int32_t op
= URX_BUILD(URX_LA_END
, dataLoc
);
1775 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1781 // See comment at doOpenLookAheadNeg
1782 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
1783 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
1784 int32_t dataLoc
= URX_VAL(startOp
);
1785 int32_t op
= URX_BUILD(URX_LA_END
, dataLoc
);
1786 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1787 op
= URX_BUILD(URX_FAIL
, 0);
1788 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1790 // Patch the URX_SAVE near the top of the block.
1791 int32_t saveOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
1792 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
1793 int32_t dest
= fRXPat
->fCompiledPat
->size();
1794 saveOp
= URX_BUILD(URX_STATE_SAVE
, dest
);
1795 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
1801 // See comment at doOpenLookBehind.
1803 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
1804 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
1805 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
1806 int32_t dataLoc
= URX_VAL(startOp
);
1807 int32_t op
= URX_BUILD(URX_LB_END
, dataLoc
);
1808 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1809 op
= URX_BUILD(URX_LA_END
, dataLoc
);
1810 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1812 // Determine the min and max bounds for the length of the
1813 // string that the pattern can match.
1814 // An unbounded upper limit is an error.
1815 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
1816 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
1817 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
1818 if (maxML
== INT32_MAX
) {
1819 error(U_REGEX_LOOK_BEHIND_LIMIT
);
1822 U_ASSERT(minML
<= maxML
);
1824 // Insert the min and max match len bounds into the URX_LB_CONT op that
1825 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1826 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
1827 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
1836 // See comment at doOpenLookBehindNeg.
1838 // Append the URX_LBN_END to the compiled pattern.
1839 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
1840 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
1841 int32_t dataLoc
= URX_VAL(startOp
);
1842 int32_t op
= URX_BUILD(URX_LBN_END
, dataLoc
);
1843 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1845 // Determine the min and max bounds for the length of the
1846 // string that the pattern can match.
1847 // An unbounded upper limit is an error.
1848 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
1849 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
1850 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
1851 if (maxML
== INT32_MAX
) {
1852 error(U_REGEX_LOOK_BEHIND_LIMIT
);
1855 U_ASSERT(minML
<= maxML
);
1857 // Insert the min and max match len bounds into the URX_LB_CONT op that
1858 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1859 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
1860 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
1862 // Insert the pattern location to continue at after a successful match
1863 // as the last operand of the URX_LBN_CONT
1864 op
= URX_BUILD(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
1865 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
1875 // remember the next location in the compiled pattern.
1876 // The compilation of Quantifiers will look at this to see whether its looping
1877 // over a parenthesized block or a single item
1878 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
1883 //------------------------------------------------------------------------------
1885 // compileSet Compile the pattern operations for a reference to a
1888 //------------------------------------------------------------------------------
1889 void RegexCompile::compileSet(UnicodeSet
*theSet
)
1891 if (theSet
== NULL
) {
1894 int32_t setSize
= theSet
->size();
1895 UChar32 firstSetChar
= theSet
->charAt(0);
1896 if (firstSetChar
== -1) {
1897 // Sets that contain only strings, but no individual chars,
1898 // will end up here.
1899 error(U_REGEX_SET_CONTAINS_STRING
);
1906 // Set of no elements. Always fails to match.
1907 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKTRACK
, 0), *fStatus
);
1914 // The set contains only a single code point. Put it into
1915 // the compiled pattern as a single char operation rather
1916 // than a set, and discard the set itself.
1917 literalChar(firstSetChar
);
1924 // The set contains two or more chars. (the normal case)
1925 // Put it into the compiled pattern as a set.
1926 int32_t setNumber
= fRXPat
->fSets
->size();
1927 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
1928 int32_t setOp
= URX_BUILD(URX_SETREF
, setNumber
);
1929 fRXPat
->fCompiledPat
->addElement(setOp
, *fStatus
);
1935 //------------------------------------------------------------------------------
1937 // compileInterval Generate the code for a {min, max} style interval quantifier.
1938 // Except for the specific opcodes used, the code is the same
1939 // for all three types (greedy, non-greedy, possessive) of
1940 // intervals. The opcodes are supplied as parameters.
1942 // The code for interval loops has this form:
1943 // 0 CTR_INIT counter loc (in stack frame)
1944 // 1 5 patt address of CTR_LOOP at bottom of block
1946 // 3 max count (-1 for unbounded)
1947 // 4 ... block to be iterated over
1951 //------------------------------------------------------------------------------
1952 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
1954 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
1955 // four slots in the compiled code. Reserve them.
1956 int32_t topOfBlock
= blockTopLoc(TRUE
);
1957 insertOp(topOfBlock
);
1958 insertOp(topOfBlock
);
1959 insertOp(topOfBlock
);
1961 // The operands for the CTR_INIT opcode include the index in the matcher data
1962 // of the counter. Allocate it now.
1963 int32_t counterLoc
= fRXPat
->fFrameSize
;
1964 fRXPat
->fFrameSize
++;
1966 int32_t op
= URX_BUILD(InitOp
, counterLoc
);
1967 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
1969 // The second operand of CTR_INIT is the location following the end of the loop.
1970 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
1971 // compilation of something later on causes the code to grow and the target
1972 // position to move.
1973 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
1974 op
= URX_BUILD(URX_RELOC_OPRND
, loopEnd
);
1975 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
1977 // Followed by the min and max counts.
1978 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
1979 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
1981 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
1982 // Goes at end of the block being looped over, so just append to the code so far.
1983 op
= URX_BUILD(LoopOp
, topOfBlock
);
1984 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1986 if ((fIntervalLow
& 0xff000000) != 0 ||
1987 fIntervalUpper
> 0 && (fIntervalUpper
& 0xff000000) != 0) {
1988 error(U_REGEX_NUMBER_TOO_BIG
);
1991 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
1992 error(U_REGEX_MAX_LT_MIN
);
1998 UBool
RegexCompile::compileInlineInterval() {
1999 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
2000 // Too big to inline. Fail, which will cause looping code to be generated.
2001 // (Upper < Lower picks up unbounded upper and errors, both.)
2005 int32_t topOfBlock
= blockTopLoc(FALSE
);
2006 if (fIntervalUpper
== 0) {
2007 // Pathological case. Attempt no matches, as if the block doesn't exist.
2008 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2012 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2013 // The thing being repeated is not a single op, but some
2014 // more complex block. Do it as a loop, not inlines.
2015 // Note that things "repeated" a max of once are handled as inline, because
2016 // the one copy of the code already generated is just fine.
2020 // Pick up the opcode that is to be repeated
2022 int32_t op
= fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2024 // Compute the pattern location where the inline sequence
2025 // will end, and set up the state save op that will be needed.
2027 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2028 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2029 int32_t saveOp
= URX_BUILD(URX_STATE_SAVE
, endOfSequenceLoc
);
2030 if (fIntervalLow
== 0) {
2031 insertOp(topOfBlock
);
2032 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2037 // Loop, emitting the op for the thing being repeated each time.
2038 // Loop starts at 1 because one instance of the op already exists in the pattern,
2039 // it was put there when it was originally encountered.
2041 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2042 if (i
== fIntervalLow
) {
2043 fRXPat
->fCompiledPat
->addElement(saveOp
, *fStatus
);
2045 if (i
> fIntervalLow
) {
2046 fRXPat
->fCompiledPat
->addElement(saveOp
, *fStatus
);
2048 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
2055 //------------------------------------------------------------------------------
2057 // matchStartType Determine how a match can start.
2058 // Used to optimize find() operations.
2060 // Operation is very similar to minMatchLength(). Walk the compiled
2061 // pattern, keeping an on-going minimum-match-length. For any
2062 // op where the min match coming in is zero, add that ops possible
2063 // starting matches to the possible starts for the overall pattern.
2065 //------------------------------------------------------------------------------
2066 void RegexCompile::matchStartType() {
2067 if (U_FAILURE(*fStatus
)) {
2072 int32_t loc
; // Location in the pattern of the current op being processed.
2073 int32_t op
; // The op being processed
2074 int32_t opType
; // The opcode type of the op
2075 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2076 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2078 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2079 // could have advanced the position in a match.
2080 // (Maximum match length so far == 0)
2082 // forwardedLength is a vector holding minimum-match-length values that
2083 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2084 // It must be one longer than the pattern being checked because some ops
2085 // will jmp to a end-of-block+1 location from within a block, and we must
2086 // count those when checking the block.
2087 int32_t end
= fRXPat
->fCompiledPat
->size();
2088 UVector32
forwardedLength(end
+1, *fStatus
);
2089 forwardedLength
.setSize(end
+1);
2090 for (loc
=3; loc
<end
; loc
++) {
2091 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2094 for (loc
= 3; loc
<end
; loc
++) {
2095 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2096 opType
= URX_TYPE(op
);
2098 // The loop is advancing linearly through the pattern.
2099 // If the op we are now at was the destination of a branch in the pattern,
2100 // and that path has a shorter minimum length than the current accumulated value,
2101 // replace the current accumulated value.
2102 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2103 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2104 currentLen
= forwardedLength
.elementAti(loc
);
2105 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2109 // Ops that don't change the total length matched
2110 case URX_RESERVED_OP
:
2112 case URX_STRING_LEN
:
2114 case URX_START_CAPTURE
:
2115 case URX_END_CAPTURE
:
2116 case URX_BACKSLASH_B
:
2117 case URX_BACKSLASH_BU
:
2118 case URX_BACKSLASH_G
:
2119 case URX_BACKSLASH_Z
:
2121 case URX_RELOC_OPRND
:
2122 case URX_STO_INP_LOC
:
2125 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2128 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2134 fRXPat
->fStartType
= START_START
;
2140 fRXPat
->fStartType
= START_LINE
;
2145 if (currentLen
== 0) {
2146 // This character could appear at the start of a match.
2147 // Add it to the set of possible starting characters.
2148 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2149 numInitialStrings
+= 2;
2157 if (currentLen
== 0) {
2158 int32_t sn
= URX_VAL(op
);
2159 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2160 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2161 fRXPat
->fInitialChars
->addAll(*s
);
2162 numInitialStrings
+= 2;
2169 // [Set]*, like a SETREF, above, in what it can match,
2170 // but may not match at all, so currentLen is not incremented.
2171 if (currentLen
== 0) {
2172 int32_t sn
= URX_VAL(op
);
2173 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2174 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2175 fRXPat
->fInitialChars
->addAll(*s
);
2176 numInitialStrings
+= 2;
2181 case URX_LOOP_DOT_I
:
2182 if (currentLen
== 0) {
2183 // .* at the start of a pattern.
2184 // Any character can begin the match.
2185 fRXPat
->fInitialChars
->clear();
2186 fRXPat
->fInitialChars
->complement();
2187 numInitialStrings
+= 2;
2193 case URX_STATIC_SETREF
:
2194 if (currentLen
== 0) {
2195 int32_t sn
= URX_VAL(op
);
2196 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2197 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2198 fRXPat
->fInitialChars
->addAll(*s
);
2199 numInitialStrings
+= 2;
2207 case URX_STAT_SETREF_N
:
2208 if (currentLen
== 0) {
2209 int32_t sn
= URX_VAL(op
);
2210 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2213 fRXPat
->fInitialChars
->addAll(sc
);
2214 numInitialStrings
+= 2;
2222 case URX_BACKSLASH_D
:
2224 if (currentLen
== 0) {
2226 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2227 if (URX_VAL(op
) != 0) {
2230 fRXPat
->fInitialChars
->addAll(s
);
2231 numInitialStrings
+= 2;
2239 // Case Insensitive Single Character.
2240 if (currentLen
== 0) {
2241 UChar32 c
= URX_VAL(op
);
2242 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2243 // character may have distinct cased forms. Add all of them
2244 // to the set of possible starting match chars.
2246 s
.closeOver(USET_CASE_INSENSITIVE
);
2247 fRXPat
->fInitialChars
->addAll(s
);
2249 // Char has no case variants. Just add it as-is to the
2250 // set of possible starting chars.
2251 fRXPat
->fInitialChars
->add(c
);
2253 numInitialStrings
+= 2;
2260 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2261 case URX_DOTANY_ALL
: // . matches one or two.
2263 case URX_DOTANY_ALL_PL
:
2265 if (currentLen
== 0) {
2266 // These constructs are all bad news when they appear at the start
2267 // of a match. Any character can begin the match.
2268 fRXPat
->fInitialChars
->clear();
2269 fRXPat
->fInitialChars
->complement();
2270 numInitialStrings
+= 2;
2278 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2281 int32_t jmpDest
= URX_VAL(op
);
2282 if (jmpDest
< loc
) {
2283 // Loop of some kind. Can safely ignore, the worst that will happen
2284 // is that we understate the true minimum length
2285 currentLen
= forwardedLength
.elementAti(loc
+1);
2288 // Forward jump. Propagate the current min length to the target loc of the jump.
2289 U_ASSERT(jmpDest
<= end
+1);
2290 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2291 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2300 // Combo of state save to the next loc, + jmp backwards.
2301 // Net effect on min. length computation is nothing.
2306 // Fails are kind of like a branch, except that the min length was
2307 // propagated already, by the state save.
2308 currentLen
= forwardedLength
.elementAti(loc
+1);
2313 case URX_STATE_SAVE
:
2315 // State Save, for forward jumps, propagate the current minimum.
2316 // of the state save.
2317 int32_t jmpDest
= URX_VAL(op
);
2318 if (jmpDest
> loc
) {
2319 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2320 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2333 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2334 int32_t stringLen
= URX_VAL(stringLenOp
);
2335 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2336 U_ASSERT(stringLenOp
>= 2);
2337 if (currentLen
== 0) {
2338 // Add the starting character of this string to the set of possible starting
2339 // characters for this pattern.
2340 int32_t stringStartIdx
= URX_VAL(op
);
2341 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2342 fRXPat
->fInitialChars
->add(c
);
2344 // Remember this string. After the entire pattern has been checked,
2345 // if nothing else is identified that can start a match, we'll use it.
2346 numInitialStrings
++;
2347 fRXPat
->fInitialStringIdx
= stringStartIdx
;
2348 fRXPat
->fInitialStringLen
= stringLen
;
2351 currentLen
+= stringLen
;
2358 // Case-insensitive string. Unlike exact-match strings, we won't
2359 // attempt a string search for possible match positions. But we
2360 // do update the set of possible starting characters.
2362 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2363 int32_t stringLen
= URX_VAL(stringLenOp
);
2364 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2365 U_ASSERT(stringLenOp
>= 2);
2366 if (currentLen
== 0) {
2367 // Add the starting character of this string to the set of possible starting
2368 // characters for this pattern.
2369 int32_t stringStartIdx
= URX_VAL(op
);
2370 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2372 s
.closeOver(USET_CASE_INSENSITIVE
);
2373 fRXPat
->fInitialChars
->addAll(s
);
2374 numInitialStrings
+= 2; // Matching on an initial string not possible.
2376 currentLen
+= stringLen
;
2382 case URX_CTR_INIT_NG
:
2384 // Loop Init Ops. These don't change the min length, but they are 4 word ops
2385 // so location must be updated accordingly.
2387 // If the min loop count == 0
2388 // move loc forwards to the end of the loop, skipping over the body.
2389 // If the min count is > 0,
2390 // continue normal processing of the body of the loop.
2391 int32_t loopEndLoc
= fRXPat
->fCompiledPat
->elementAti(loc
+1);
2392 loopEndLoc
= URX_VAL(loopEndLoc
);
2393 int32_t minLoopCount
= fRXPat
->fCompiledPat
->elementAti(loc
+2);
2394 if (minLoopCount
== 0) {
2395 // Min Loop Count of 0, treat like a forward branch and
2396 // move the current minimum length up to the target
2397 // (end of loop) location.
2398 U_ASSERT(loopEndLoc
<= end
+1);
2399 if (forwardedLength
.elementAti(loopEndLoc
) > currentLen
) {
2400 forwardedLength
.setElementAt(currentLen
, loopEndLoc
);
2403 loc
+=3; // Skips over operands of CTR_INIT
2410 case URX_CTR_LOOP_NG
:
2412 // The jump is conditional, backwards only.
2417 // More loop ops. These state-save to themselves.
2418 // don't change the minimum match
2426 // Look-around. Scan forward until the matching look-ahead end,
2427 // without processing the look-around block. This is overly pessimistic.
2431 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2432 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2435 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2441 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
2442 // Need this because neg lookahead blocks will FAIL to outside
2444 int32_t jmpDest
= URX_VAL(op
);
2445 if (jmpDest
> loc
) {
2446 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2447 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2451 U_ASSERT(loc
<= end
);
2461 U_ASSERT(FALSE
); // Shouldn't get here. These ops should be
2462 // consumed by the scan in URX_LA_START and LB_START
2473 // We have finished walking through the ops. Check whether some forward jump
2474 // propagated a shorter length to location end+1.
2475 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
2476 currentLen
= forwardedLength
.elementAti(end
+1);
2480 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
2483 // Sort out what we should check for when looking for candidate match start positions.
2484 // In order of preference,
2485 // 1. Start of input text buffer.
2486 // 2. A literal string.
2487 // 3. Start of line in multi-line mode.
2488 // 4. A single literal character.
2489 // 5. A character from a set of characters.
2491 if (fRXPat
->fStartType
== START_START
) {
2492 // Match only at the start of an input text string.
2493 // start type is already set. We're done.
2494 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
2495 // Match beginning only with a literal string.
2496 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
2497 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
2498 fRXPat
->fStartType
= START_STRING
;
2499 fRXPat
->fInitialChar
= c
;
2500 } else if (fRXPat
->fStartType
== START_LINE
) {
2501 // Match at start of line in Multi-Line mode.
2502 // Nothing to do here; everything is already set.
2503 } else if (fRXPat
->fMinMatchLen
== 0) {
2504 // Zero length match possible. We could start anywhere.
2505 fRXPat
->fStartType
= START_NO_INFO
;
2506 } else if (fRXPat
->fInitialChars
->size() == 1) {
2507 // All matches begin with the same char.
2508 fRXPat
->fStartType
= START_CHAR
;
2509 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
2510 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
2511 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
2512 fRXPat
->fMinMatchLen
> 0) {
2513 // Matches start with a set of character smaller than the set of all chars.
2514 fRXPat
->fStartType
= START_SET
;
2516 // Matches can start with anything
2517 fRXPat
->fStartType
= START_NO_INFO
;
2525 //------------------------------------------------------------------------------
2527 // minMatchLength Calculate the length of the shortest string that could
2528 // match the specified pattern.
2529 // Length is in 16 bit code units, not code points.
2531 // The calculated length may not be exact. The returned
2532 // value may be shorter than the actual minimum; it must
2535 // start and end are the range of p-code operations to be
2536 // examined. The endpoints are included in the range.
2538 //------------------------------------------------------------------------------
2539 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
2540 if (U_FAILURE(*fStatus
)) {
2544 U_ASSERT(start
<= end
);
2545 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
2551 int32_t currentLen
= 0;
2554 // forwardedLength is a vector holding minimum-match-length values that
2555 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2556 // It must be one longer than the pattern being checked because some ops
2557 // will jmp to a end-of-block+1 location from within a block, and we must
2558 // count those when checking the block.
2559 UVector32
forwardedLength(end
+2, *fStatus
);
2560 forwardedLength
.setSize(end
+2);
2561 for (loc
=start
; loc
<=end
+1; loc
++) {
2562 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2565 for (loc
= start
; loc
<=end
; loc
++) {
2566 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2567 opType
= URX_TYPE(op
);
2569 // The loop is advancing linearly through the pattern.
2570 // If the op we are now at was the destination of a branch in the pattern,
2571 // and that path has a shorter minimum length than the current accumulated value,
2572 // replace the current accumulated value.
2573 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2574 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2575 currentLen
= forwardedLength
.elementAti(loc
);
2576 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2580 // Ops that don't change the total length matched
2581 case URX_RESERVED_OP
:
2583 case URX_STRING_LEN
:
2585 case URX_START_CAPTURE
:
2586 case URX_END_CAPTURE
:
2587 case URX_BACKSLASH_B
:
2588 case URX_BACKSLASH_BU
:
2589 case URX_BACKSLASH_G
:
2590 case URX_BACKSLASH_Z
:
2593 case URX_RELOC_OPRND
:
2594 case URX_STO_INP_LOC
:
2598 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2601 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2609 // Ops that match a minimum of one character (one or two 16 bit code units.)
2612 case URX_STATIC_SETREF
:
2613 case URX_STAT_SETREF_N
:
2615 case URX_BACKSLASH_D
:
2617 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2618 case URX_DOTANY_ALL
: // . matches one or two.
2621 case URX_DOTANY_ALL_PL
:
2627 loc
++; // URX_JMPX has an extra operand, ignored here,
2628 // otherwise processed identically to URX_JMP.
2631 int32_t jmpDest
= URX_VAL(op
);
2632 if (jmpDest
< loc
) {
2633 // Loop of some kind. Can safely ignore, the worst that will happen
2634 // is that we understate the true minimum length
2635 currentLen
= forwardedLength
.elementAti(loc
+1);
2637 // Forward jump. Propagate the current min length to the target loc of the jump.
2638 U_ASSERT(jmpDest
<= end
+1);
2639 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2640 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2648 // Fails are kind of like a branch, except that the min length was
2649 // propagated already, by the state save.
2650 currentLen
= forwardedLength
.elementAti(loc
+1);
2651 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2656 case URX_STATE_SAVE
:
2658 // State Save, for forward jumps, propagate the current minimum.
2659 // of the state save.
2660 int32_t jmpDest
= URX_VAL(op
);
2661 if (jmpDest
> loc
) {
2662 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2663 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2674 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2675 currentLen
+= URX_VAL(stringLenOp
);
2681 case URX_CTR_INIT_NG
:
2684 // If the min loop count == 0
2685 // move loc forwards to the end of the loop, skipping over the body.
2686 // If the min count is > 0,
2687 // continue normal processing of the body of the loop.
2688 int32_t loopEndLoc
= fRXPat
->fCompiledPat
->elementAti(loc
+1);
2689 loopEndLoc
= URX_VAL(loopEndLoc
);
2690 int32_t minLoopCount
= fRXPat
->fCompiledPat
->elementAti(loc
+2);
2691 if (minLoopCount
== 0) {
2694 loc
+=3; // Skips over operands of CTR_INIT
2701 case URX_CTR_LOOP_NG
:
2703 // The jump is conditional, backwards only.
2707 case URX_LOOP_DOT_I
:
2709 // More loop ops. These state-save to themselves.
2710 // don't change the minimum match - could match nothing at all.
2717 // Look-around. Scan forward until the matching look-ahead end,
2718 // without processing the look-around block. This is overly pessimistic.
2719 // TODO: Positive lookahead could recursively do the block, then continue
2720 // with the longer of the block or the value coming in.
2724 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2725 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2728 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2734 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
2735 // Need this because neg lookahead blocks will FAIL to outside
2737 int32_t jmpDest
= URX_VAL(op
);
2738 if (jmpDest
> loc
) {
2739 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2740 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2745 U_ASSERT(loc
<= end
);
2755 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
2756 // range being sized, which happens when measuring size of look-behind blocks.
2765 // We have finished walking through the ops. Check whether some forward jump
2766 // propagated a shorter length to location end+1.
2767 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
2768 currentLen
= forwardedLength
.elementAti(end
+1);
2769 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2777 //------------------------------------------------------------------------------
2779 // maxMatchLength Calculate the length of the longest string that could
2780 // match the specified pattern.
2781 // Length is in 16 bit code units, not code points.
2783 // The calculated length may not be exact. The returned
2784 // value may be longer than the actual maximum; it must
2785 // never be shorter.
2787 //------------------------------------------------------------------------------
2788 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
2789 if (U_FAILURE(*fStatus
)) {
2792 U_ASSERT(start
<= end
);
2793 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
2799 int32_t currentLen
= 0;
2800 UVector32
forwardedLength(end
+1, *fStatus
);
2801 forwardedLength
.setSize(end
+1);
2803 for (loc
=start
; loc
<=end
; loc
++) {
2804 forwardedLength
.setElementAt(0, loc
);
2807 for (loc
= start
; loc
<=end
; loc
++) {
2808 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2809 opType
= URX_TYPE(op
);
2811 // The loop is advancing linearly through the pattern.
2812 // If the op we are now at was the destination of a branch in the pattern,
2813 // and that path has a longer maximum length than the current accumulated value,
2814 // replace the current accumulated value.
2815 if (forwardedLength
.elementAti(loc
) > currentLen
) {
2816 currentLen
= forwardedLength
.elementAti(loc
);
2820 // Ops that don't change the total length matched
2821 case URX_RESERVED_OP
:
2823 case URX_STRING_LEN
:
2825 case URX_START_CAPTURE
:
2826 case URX_END_CAPTURE
:
2827 case URX_BACKSLASH_B
:
2828 case URX_BACKSLASH_BU
:
2829 case URX_BACKSLASH_G
:
2830 case URX_BACKSLASH_Z
:
2833 case URX_RELOC_OPRND
:
2834 case URX_STO_INP_LOC
:
2839 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2849 // Ops that increase that cause an unbounded increase in the length
2850 // of a matched string, or that increase it a hard to characterize way.
2851 // Call the max length unbounded, and stop further checking.
2852 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2854 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2856 case URX_DOTANY_ALL_PL
:
2857 currentLen
= INT32_MAX
;
2861 // Ops that match a max of one character (possibly two 16 bit code units.)
2863 case URX_STATIC_SETREF
:
2864 case URX_STAT_SETREF_N
:
2866 case URX_BACKSLASH_D
:
2868 case URX_DOTANY_ALL
:
2873 // Single literal character. Increase current max length by one or two,
2874 // depending on whether the char is in the supplementary range.
2877 if (URX_VAL(op
) > 0x10000) {
2889 int32_t jmpDest
= URX_VAL(op
);
2890 if (jmpDest
< loc
) {
2891 // Loop of some kind. Max match length is unbounded.
2892 currentLen
= INT32_MAX
;
2894 // Forward jump. Propagate the current min length to the target loc of the jump.
2895 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
2896 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2904 // Fails are kind of like a branch, except that the max length was
2905 // propagated already, by the state save.
2906 currentLen
= forwardedLength
.elementAti(loc
+1);
2910 case URX_STATE_SAVE
:
2912 // State Save, for forward jumps, propagate the current minimum.
2913 // of the state save.
2914 // For backwards jumps, they create a loop, maximum
2915 // match length is unbounded.
2916 int32_t jmpDest
= URX_VAL(op
);
2917 if (jmpDest
> loc
) {
2918 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
2919 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2922 currentLen
= INT32_MAX
;
2934 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2935 currentLen
+= URX_VAL(stringLenOp
);
2941 case URX_CTR_INIT_NG
:
2943 case URX_CTR_LOOP_NG
:
2945 case URX_LOOP_DOT_I
:
2947 // For anything to do with loops, make the match length unbounded.
2948 // Note: INIT instructions are multi-word. Can ignore because
2949 // INT32_MAX length will stop the per-instruction loop.
2950 currentLen
= INT32_MAX
;
2957 // Look-ahead. Just ignore, treat the look-ahead block as if
2958 // it were normal pattern. Gives a too-long match length,
2959 // but good enough for now.
2962 // End of look-ahead ops should always be consumed by the processing at
2963 // the URX_LA_START op.
2969 // Look-behind. Scan forward until the matching look-around end,
2970 // without processing the look-behind block.
2974 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2975 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2978 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2984 U_ASSERT(loc
< end
);
2994 if (currentLen
== INT32_MAX
) {
2995 // The maximum length is unbounded.
2996 // Stop further processing of the pattern.
3006 //------------------------------------------------------------------------------
3008 // stripNOPs Remove any NOP operations from the compiled pattern code.
3009 // Extra NOPs are inserted for some constructs during the initial
3010 // code generation to provide locations that may be patched later.
3011 // Many end up unneeded, and are removed by this function.
3013 //------------------------------------------------------------------------------
3014 void RegexCompile::stripNOPs() {
3016 if (U_FAILURE(*fStatus
)) {
3020 int32_t end
= fRXPat
->fCompiledPat
->size();
3021 UVector32
deltas(end
, *fStatus
);
3023 // Make a first pass over the code, computing the amount that things
3024 // will be offset at each location in the original code.
3027 for (loc
=0; loc
<end
; loc
++) {
3028 deltas
.addElement(d
, *fStatus
);
3029 int32_t op
= fRXPat
->fCompiledPat
->elementAti(loc
);
3030 if (URX_TYPE(op
) == URX_NOP
) {
3035 // Make a second pass over the code, removing the NOPs by moving following
3036 // code up, and patching operands that refer to code locations that
3037 // are being moved. The array of offsets from the first step is used
3038 // to compute the new operand values.
3041 for (src
=0; src
<end
; src
++) {
3042 int32_t op
= fRXPat
->fCompiledPat
->elementAti(src
);
3043 int32_t opType
= URX_TYPE(op
);
3048 case URX_STATE_SAVE
:
3051 case URX_CTR_LOOP_NG
:
3052 case URX_RELOC_OPRND
:
3056 // These are instructions with operands that refer to code locations.
3058 int32_t operandAddress
= URX_VAL(op
);
3059 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3060 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3061 op
= URX_BUILD(opType
, fixedOperandAddress
);
3062 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3067 case URX_RESERVED_OP
:
3068 case URX_RESERVED_OP_N
:
3073 case URX_STRING_LEN
:
3074 case URX_START_CAPTURE
:
3075 case URX_END_CAPTURE
:
3076 case URX_STATIC_SETREF
:
3077 case URX_STAT_SETREF_N
:
3081 case URX_BACKSLASH_B
:
3082 case URX_BACKSLASH_BU
:
3083 case URX_BACKSLASH_G
:
3084 case URX_BACKSLASH_X
:
3085 case URX_BACKSLASH_Z
:
3086 case URX_DOTANY_ALL
:
3087 case URX_DOTANY_ALL_PL
:
3089 case URX_BACKSLASH_D
:
3093 case URX_CTR_INIT_NG
:
3097 case URX_STO_INP_LOC
:
3111 case URX_LOOP_DOT_I
:
3113 // These instructions are unaltered by the relocation.
3114 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3119 // Some op is unaccounted for.
3121 error(U_REGEX_INTERNAL_ERROR
);
3125 fRXPat
->fCompiledPat
->setSize(dst
);
3131 //------------------------------------------------------------------------------
3133 // OptDotStar Optimize patterns that end with a '.*' or '.+' to
3134 // just advance the input to the end.
3136 // Transform this compiled sequence
3137 // [DOT_ANY | DOT_ANY_ALL]
3138 // JMP_SAV to previous instruction
3139 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3144 // [DOT_ANY_PL | DOT_ANY_ALL_PL]
3145 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3148 //------------------------------------------------------------------------------
3149 void RegexCompile::OptDotStar() {
3150 // Scan backwards in the pattern, looking for a JMP_SAV near the end.
3154 for (jmpLoc
=fRXPat
->fCompiledPat
->size(); jmpLoc
--;) {
3156 op
= fRXPat
->fCompiledPat
->elementAti(jmpLoc
);
3157 opType
= URX_TYPE(op
);
3163 case URX_END_CAPTURE
:
3166 case URX_BACKSLASH_Z
:
3167 // These ops may follow the JMP_SAV without preventing us from
3168 // doing this optimization.
3172 // Got a trailing JMP_SAV that's a candidate for optimization.
3176 // This optimization not possible.
3179 break; // from the for loop.
3182 // We found in URX_JMP_SAV near the end that is a candidate for optimizing.
3183 // Is the target address the previous instruction?
3184 // Is the previous instruction a flavor of URX_DOTANY
3185 int32_t loopTopLoc
= URX_VAL(op
);
3186 if (loopTopLoc
!= jmpLoc
-1) {
3190 int32_t oldOp
= fRXPat
->fCompiledPat
->elementAti(loopTopLoc
);
3191 int32_t oldOpType
= opType
= URX_TYPE(oldOp
);
3192 if (oldOpType
== URX_DOTANY
) {
3193 newOp
= URX_BUILD(URX_DOTANY_PL
, 0);
3195 else if (oldOpType
== URX_DOTANY_ALL
) {
3196 newOp
= URX_BUILD(URX_DOTANY_ALL_PL
, 0);
3198 return; // Sequence we were looking for isn't there.
3201 // Substitute the new instructions into the pattern.
3202 // The NOP will be removed in a later optimization step.
3203 fRXPat
->fCompiledPat
->setElementAt(URX_BUILD(URX_NOP
, 0), loopTopLoc
);
3204 fRXPat
->fCompiledPat
->setElementAt(newOp
, jmpLoc
);
3208 //------------------------------------------------------------------------------
3210 // Error Report a rule parse error.
3211 // Only report it if no previous error has been recorded.
3213 //------------------------------------------------------------------------------
3214 void RegexCompile::error(UErrorCode e
) {
3215 if (U_SUCCESS(*fStatus
)) {
3217 fParseErr
->line
= fLineNum
;
3218 fParseErr
->offset
= fCharNum
;
3220 // Fill in the context.
3221 // Note: extractBetween() pins supplied indicies to the string bounds.
3222 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3223 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3224 fRXPat
->fPattern
.extractBetween(fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
,
3225 fParseErr
->preContext
, 0);
3226 fRXPat
->fPattern
.extractBetween(fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1,
3227 fParseErr
->postContext
, 0);
3233 // Assorted Unicode character constants.
3234 // Numeric because there is no portable way to enter them as literals.
3237 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3238 static const UChar chLF
= 0x0a;
3239 static const UChar chNEL
= 0x85; // NEL newline variant
3240 static const UChar chLS
= 0x2028; // Unicode Line Separator
3241 static const UChar chPound
= 0x23; // '#', introduces a comment.
3242 static const UChar chE
= 0x45; // 'E'
3243 static const UChar chUpperN
= 0x4E;
3244 static const UChar chLowerP
= 0x70;
3245 static const UChar chUpperP
= 0x50;
3246 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3247 static const UChar chLBracket
= 0x5b;
3248 static const UChar chRBracket
= 0x5d;
3249 static const UChar chRBrace
= 0x7d;
3252 //------------------------------------------------------------------------------
3254 // nextCharLL Low Level Next Char from the regex pattern.
3255 // Get a char from the string, keep track of input position
3256 // for error reporting.
3258 //------------------------------------------------------------------------------
3259 UChar32
RegexCompile::nextCharLL() {
3261 UnicodeString
&pattern
= fRXPat
->fPattern
;
3263 if (fPeekChar
!= -1) {
3268 if (fPatternLength
==0 || fNextIndex
>= fPatternLength
) {
3271 ch
= pattern
.char32At(fNextIndex
);
3272 fNextIndex
= pattern
.moveIndex32(fNextIndex
, 1);
3277 ch
== chLF
&& fLastChar
!= chCR
) {
3278 // Character is starting a new line. Bump up the line number, and
3279 // reset the column to 0.
3283 error(U_REGEX_RULE_SYNTAX
);
3288 // Character is not starting a new line. Except in the case of a
3289 // LF following a CR, increment the column position.
3298 //------------------------------------------------------------------------------
3300 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3301 // character without actually getting it.
3303 //------------------------------------------------------------------------------
3304 UChar32
RegexCompile::peekCharLL() {
3305 if (fPeekChar
== -1) {
3306 fPeekChar
= nextCharLL();
3312 //------------------------------------------------------------------------------
3314 // nextChar for pattern scanning. At this level, we handle stripping
3315 // out comments and processing some backslash character escapes.
3316 // The rest of the pattern grammar is handled at the next level up.
3318 //------------------------------------------------------------------------------
3319 void RegexCompile::nextChar(RegexPatternChar
&c
) {
3321 fScanIndex
= fNextIndex
;
3322 c
.fChar
= nextCharLL();
3327 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
) || c
.fChar
== (UChar32
)-1) {
3328 fQuoteMode
= FALSE
; // Exit quote mode,
3329 nextCharLL(); // discard the E
3330 nextChar(c
); // recurse to get the real next char
3333 else if (fInBackslashQuote
) {
3334 // The current character immediately follows a '\'
3335 // Don't check for any further escapes, just return it as-is.
3336 // Don't set c.fQuoted, because that would prevent the state machine from
3337 // dispatching on the character.
3338 fInBackslashQuote
= FALSE
;
3342 // We are not in a \Q quoted region \E of the source.
3344 if (fModeFlags
& UREGEX_COMMENTS
) {
3346 // We are in free-spacing and comments mode.
3347 // Scan through any white space and comments, until we
3348 // reach a significant character or the end of inut.
3350 if (c
.fChar
== (UChar32
)-1) {
3351 break; // End of Input
3353 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
3354 // Start of a comment. Consume the rest of it, until EOF or a new line
3356 c
.fChar
= nextCharLL();
3357 if (c
.fChar
== (UChar32
)-1 || // EOF
3366 if (uprv_isRuleWhiteSpace(c
.fChar
) == FALSE
) {
3369 c
.fChar
= nextCharLL();
3374 // check for backslash escaped characters.
3376 int32_t startX
= fNextIndex
; // start and end positions of the
3377 int32_t endX
= fNextIndex
; // sequence following the '\'
3378 if (c
.fChar
== chBackSlash
) {
3379 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
->contains(peekCharLL())) {
3381 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3382 // Includes \uxxxx, \n, \r, many others.
3383 // Return the single equivalent character.
3385 nextCharLL(); // get & discard the peeked char.
3387 c
.fChar
= fRXPat
->fPattern
.unescapeAt(endX
);
3388 if (startX
== endX
) {
3389 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
3391 fCharNum
+= endX
- startX
;
3396 // We are in a '\' escape that will be handled by the state table scanner.
3397 // Just return the backslash, but remember that the following char is to
3398 // be taken literally. TODO: this is awkward, think about alternatives.
3399 fInBackslashQuote
= TRUE
;
3404 // re-enable # to end-of-line comments, in case they were disabled.
3405 // They are disabled by the parser upon seeing '(?', but this lasts for
3406 // the fetching of the next character only.
3407 fEOLComments
= TRUE
;
3409 // putc(c.fChar, stdout);
3414 //------------------------------------------------------------------------------
3416 // scanSet Construct a UnicodeSet from the text at the current scan
3417 // position. Advance the scan position to the first character
3420 // The scan position is normally under the control of the state machine
3421 // that controls pattern parsing. UnicodeSets, however, are parsed by
3422 // the UnicodeSet constructor, not by the Regex pattern parser.
3424 //------------------------------------------------------------------------------
3425 UnicodeSet
*RegexCompile::scanSet() {
3426 UnicodeSet
*uset
= NULL
;
3430 if (U_FAILURE(*fStatus
)) {
3434 pos
.setIndex(fScanIndex
);
3435 UErrorCode localStatus
= U_ZERO_ERROR
;
3436 uint32_t usetFlags
= 0;
3437 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
3438 usetFlags
|= USET_CASE_INSENSITIVE
;
3440 if (fModeFlags
& UREGEX_COMMENTS
) {
3441 usetFlags
|= USET_IGNORE_SPACE
;
3444 uset
= new UnicodeSet(fRXPat
->fPattern
, pos
,
3445 usetFlags
, NULL
, localStatus
);
3446 if (U_FAILURE(localStatus
)) {
3447 // TODO: Get more accurate position of the error from UnicodeSet's return info.
3448 // UnicodeSet appears to not be reporting correctly at this time.
3449 REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos
.getIndex()));
3455 // Advance the current scan postion over the UnicodeSet.
3456 // Don't just set fScanIndex because the line/char positions maintained
3457 // for error reporting would be thrown off.
3460 if (fNextIndex
>= i
) {
3470 //------------------------------------------------------------------------------
3472 // scanProp Construct a UnicodeSet from the text at the current scan
3473 // position, which will be of the form \p{whaterver}
3475 // The scan position will be at the 'p' or 'P'. On return
3476 // the scan position should be just after the '}'
3478 // Return a UnicodeSet, constructed from the \P pattern,
3479 // or NULL if the pattern is invalid.
3481 //------------------------------------------------------------------------------
3482 UnicodeSet
*RegexCompile::scanProp() {
3483 UnicodeSet
*uset
= NULL
;
3485 if (U_FAILURE(*fStatus
)) {
3489 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chUpperP
|| fC
.fChar
== chUpperN
);
3491 // enclose the \p{property} from the regex pattern source in [brackets]
3492 UnicodeString setPattern
;
3493 setPattern
.append(chLBracket
);
3494 setPattern
.append(chBackSlash
);
3496 setPattern
.append(fC
.fChar
);
3497 if (fC
.fChar
== chRBrace
) {
3501 if (fC
.fChar
== -1) {
3502 // Hit the end of the input string without finding the closing '}'
3503 error(U_REGEX_PROPERTY_SYNTAX
);
3507 setPattern
.append(chRBracket
);
3509 uint32_t usetFlags
= 0;
3510 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
3511 usetFlags
|= USET_CASE_INSENSITIVE
;
3513 if (fModeFlags
& UREGEX_COMMENTS
) {
3514 usetFlags
|= USET_IGNORE_SPACE
;
3517 // Build the UnicodeSet from the set pattern we just built up in a string.
3518 uset
= new UnicodeSet(setPattern
, usetFlags
, NULL
, *fStatus
);
3519 if (U_FAILURE(*fStatus
)) {
3524 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
3529 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS