5 // Copyright (C) 2002-2004 International Business Machines Corporation and others.
6 // All Rights Reserved.
8 // This file contains the ICU regular expression compiler, which is responsible
9 // for processing a regular expression pattern into the compiled form that
10 // is used by the match finding engine.
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
33 #include "regexcst.h" // Contains state table for the regex pattern parser.
34 // generated by a Perl script.
46 //----------------------------------------------------------------------------------------
50 //----------------------------------------------------------------------------------------
51 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) : fParenStack(status
)
62 fInBackslashQuote
= FALSE
;
63 fModeFlags
= fRXPat
->fFlags
;
67 fMatchCloseParen
= -1;
70 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
71 status
= rxp
->fDeferredStatus
;
77 //----------------------------------------------------------------------------------------
81 //----------------------------------------------------------------------------------------
82 RegexCompile::~RegexCompile() {
85 //---------------------------------------------------------------------------------
87 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
88 // The state tables are hand-written in the file regexcst.txt,
89 // and converted to the form used here by a perl
92 //---------------------------------------------------------------------------------
93 void RegexCompile::compile(
94 const UnicodeString
&pat
, // Source pat to be compiled.
95 UParseError
&pp
, // Error position info
96 UErrorCode
&e
) // Error Code
101 fStack
[fStackPtr
] = 0;
103 if (U_FAILURE(*fStatus
)) {
107 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
108 U_ASSERT(fRXPat
->fPattern
.length() == 0);
110 // Prepare the RegexPattern object to receive the compiled pattern.
111 // TODO: remove per-instance field, and just use globals directly. (But check perf)
112 fRXPat
->fPattern
= pat
;
113 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
114 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
117 // Initialize the pattern scanning state machine
118 fPatternLength
= pat
.length();
120 const RegexTableEl
*tableEl
;
121 nextChar(fC
); // Fetch the first char from the pattern string.
124 // Main loop for the regex pattern parsing state machine.
125 // Runs once per state transition.
126 // Each time through optionally performs, depending on the state table,
127 // - an advance to the the next pattern char
128 // - an action to be performed.
129 // - pushing or popping a state to/from the local state return stack.
130 // file regexcst.txt is the source for the state table. The logic behind
131 // recongizing the pattern syntax is there, not here.
134 // Bail out if anything has gone wrong.
135 // Regex pattern parsing stops on the first error encountered.
136 if (U_FAILURE(*fStatus
)) {
140 U_ASSERT(state
!= 0);
142 // Find the state table element that matches the input char from the pattern, or the
143 // class of the input character. Start with the first table row for this
144 // state, then linearly scan forward until we find a row that matches the
145 // character. The last row for each state always matches all characters, so
146 // the search will stop there, if not before.
148 tableEl
= &gRuleParseStateTable
[state
];
149 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
150 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]));
152 for (;;) { // loop through table rows belonging to this state, looking for one
153 // that matches the current input char.
154 REGEX_SCAN_DEBUG_PRINTF(("."));
155 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
156 // Table row specified an individual character, not a set, and
157 // the input character is not quoted, and
158 // the input character matched it.
161 if (tableEl
->fCharClass
== 255) {
162 // Table row specified default, match anything character class.
165 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
166 // Table row specified "quoted" and the char was quoted.
169 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
170 // Table row specified eof and we hit eof on the input.
174 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
175 fC
.fQuoted
== FALSE
&& // char is not escaped &&
176 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
177 UnicodeSet
*uniset
= RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128];
178 if (uniset
->contains(fC
.fChar
)) {
179 // Table row specified a character class, or set of characters,
180 // and the current char matches it.
185 // No match on this row, advance to the next row for this state,
188 REGEX_SCAN_DEBUG_PRINTF(("\n"));
191 // We've found the row of the state table that matches the current input
192 // character from the rules string.
193 // Perform any action specified by this row in the state table.
194 if (doParseActions((EParseAction
)tableEl
->fAction
) == FALSE
) {
195 // Break out of the state machine loop if the
196 // the action signalled some kind of error, or
197 // the action was to exit, occurs on normal end-of-rules-input.
201 if (tableEl
->fPushState
!= 0) {
203 if (fStackPtr
>= kStackSize
) {
204 error(U_REGEX_INTERNAL_ERROR
);
205 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
208 fStack
[fStackPtr
] = tableEl
->fPushState
;
212 // NextChar. This is where characters are actually fetched from the pattern.
213 // Happens under control of the 'n' tag in the state table.
215 if (tableEl
->fNextChar
) {
219 // Get the next state from the table entry, or from the
220 // state stack if the next state was specified as "pop".
221 if (tableEl
->fNextState
!= 255) {
222 state
= tableEl
->fNextState
;
224 state
= fStack
[fStackPtr
];
227 // state stack underflow
228 // This will occur if the user pattern has mis-matched parentheses,
229 // with extra close parens.
232 error(U_REGEX_MISMATCHED_PAREN
);
239 // The pattern has now been read and processed, and the compiled code generated.
242 // Back-reference fixup
245 for (loc
=0; loc
<fRXPat
->fCompiledPat
->size(); loc
++) {
246 int32_t op
= fRXPat
->fCompiledPat
->elementAti(loc
);
247 int32_t opType
= URX_TYPE(op
);
248 if (opType
== URX_BACKREF
|| opType
== URX_BACKREF_I
) {
249 int32_t where
= URX_VAL(op
);
250 if (where
> fRXPat
->fGroupMap
->size()) {
251 error(U_REGEX_INVALID_BACK_REF
);
254 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
255 op
= URX_BUILD(opType
, where
);
256 fRXPat
->fCompiledPat
->setElementAt(op
, loc
);
262 // Compute the number of digits requried for the largest capture group number.
264 fRXPat
->fMaxCaptureDigits
= 1;
267 if (n
> fRXPat
->fGroupMap
->size()) {
270 fRXPat
->fMaxCaptureDigits
++;
275 // The pattern's fFrameSize so far has accumulated the requirements for
276 // storage for capture parentheses, counters, etc. that are encountered
277 // in the pattern. Add space for the two variables that are always
278 // present in the saved state: the input string position and the
279 // position in the compiled pattern.
281 fRXPat
->fFrameSize
+=2;
284 // Get bounds for the minimum and maximum length of a string that this
285 // pattern can match. Used to avoid looking for matches in strings that
288 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
291 // Optimization passes
298 // Set up fast latin-1 range sets
300 int32_t numSets
= fRXPat
->fSets
->size();
301 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
303 for (i
=0; i
<numSets
; i
++) {
304 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
305 fRXPat
->fSets8
[i
].init(s
);
314 //----------------------------------------------------------------------------------------
316 // doParseAction Do some action during regex pattern parsing.
317 // Called by the parse state machine.
319 // Generation of the match engine PCode happens here, or
320 // in functions called from the parse actions defined here.
323 //----------------------------------------------------------------------------------------
324 UBool
RegexCompile::doParseActions(EParseAction action
)
326 UBool returnVal
= TRUE
;
328 switch ((Regex_PatternParseAction
)action
) {
331 // Start of pattern compiles to:
332 //0 SAVE 2 Fall back to position of FAIL
334 //2 FAIL Stop if we ever reach here.
335 //3 NOP Dummy, so start of pattern looks the same as
336 // the start of an ( grouping.
337 //4 NOP Resreved, will be replaced by a save if there are
338 // OR | operators at the top level
339 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_STATE_SAVE
, 2), *fStatus
);
340 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_JMP
, 3), *fStatus
);
341 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_FAIL
, 0), *fStatus
);
342 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
343 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
345 fParenStack
.push(-1, *fStatus
); // Begin a Paren Stack Frame
346 fParenStack
.push( 3, *fStatus
); // Push location of first NOP
350 // We've scanned to the end of the pattern
351 // The end of pattern compiles to:
353 // which will stop the runtime match engine.
354 // Encountering end of pattern also behaves like a close paren,
355 // and forces fixups of the State Save at the beginning of the compiled pattern
356 // and of any OR operations at the top level.
359 if (fParenStack
.size() > 0) {
360 // Missing close paren in pattern.
361 error(U_REGEX_MISMATCHED_PAREN
);
364 // add the END operation to the compiled pattern.
365 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_END
, 0), *fStatus
);
367 // Terminate the pattern compilation state machine.
374 // Scanning a '|', as in (A|B)
376 // Insert a SAVE operation at the start of the pattern section preceding
377 // this OR at this level. This SAVE will branch the match forward
378 // to the right hand side of the OR in the event that the left hand
379 // side fails to match and backtracks. Locate the position for the
380 // save from the location on the top of the parentheses stack.
381 int32_t savePosition
= fParenStack
.popi();
382 int32_t op
= fRXPat
->fCompiledPat
->elementAti(savePosition
);
383 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
384 op
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
385 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
387 // Append an JMP operation into the compiled pattern. The operand for
388 // the JMP will eventually be the location following the ')' for the
389 // group. This will be patched in later, when the ')' is encountered.
390 op
= URX_BUILD(URX_JMP
, 0);
391 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
393 // Push the position of the newly added JMP op onto the parentheses stack.
394 // This registers if for fixup when this block's close paren is encountered.
395 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
397 // Append a NOP to the compiled pattern. This is the slot reserved
398 // for a SAVE in the event that there is yet another '|' following
400 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
401 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
406 case doOpenCaptureParen
:
409 // - NOP, which later may be replaced by a save-state if the
410 // parenthesized group gets a * quantifier, followed by
411 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
412 // - NOP, which may later be replaced by a save-state if there
413 // is an '|' alternation within the parens.
415 // Each capture group gets three slots in the save stack frame:
416 // 0: Capture Group start position (in input string being matched.)
417 // 1: Capture Group end positino.
418 // 2: Start of Match-in-progress.
419 // The first two locations are for a completed capture group, and are
420 // referred to by back references and the like.
421 // The third location stores the capture start position when an START_CAPTURE is
422 // encountered. This will be promoted to a completed capture when (and if) the corresponding
423 // END_CAPure is encountered.
425 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
426 int32_t varsLoc
= fRXPat
->fFrameSize
; // Reserve three slots in match stack frame.
427 fRXPat
->fFrameSize
+= 3;
428 int32_t cop
= URX_BUILD(URX_START_CAPTURE
, varsLoc
);
429 fRXPat
->fCompiledPat
->addElement(cop
, *fStatus
);
430 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
432 // On the Parentheses stack, start a new frame and add the postions
433 // of the two NOPs. Depending on what follows in the pattern, the
434 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
435 // address of the end of the parenthesized group.
436 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
437 fParenStack
.push(capturing
, *fStatus
); // Frame type.
438 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
439 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
441 // Save the mapping from group number to stack frame variable position.
442 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
446 case doOpenNonCaptureParen
:
447 // Open non-caputuring (grouping only) Paren.
449 // - NOP, which later may be replaced by a save-state if the
450 // parenthesized group gets a * quantifier, followed by
451 // - NOP, which may later be replaced by a save-state if there
452 // is an '|' alternation within the parens.
454 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
455 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
457 // On the Parentheses stack, start a new frame and add the postions
459 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
460 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
461 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
462 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
467 case doOpenAtomicParen
:
468 // Open Atomic Paren. (?>
470 // - NOP, which later may be replaced if the parenthesized group
471 // has a quantifier, followed by
472 // - STO_SP save state stack position, so it can be restored at the ")"
473 // - NOP, which may later be replaced by a save-state if there
474 // is an '|' alternation within the parens.
476 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
477 int32_t varLoc
= fRXPat
->fDataSize
; // Reserve a data location for saving the
478 fRXPat
->fDataSize
+= 1; // state stack ptr.
479 int32_t stoOp
= URX_BUILD(URX_STO_SP
, varLoc
);
480 fRXPat
->fCompiledPat
->addElement(stoOp
, *fStatus
);
481 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
483 // On the Parentheses stack, start a new frame and add the postions
484 // of the two NOPs. Depending on what follows in the pattern, the
485 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
486 // address of the end of the parenthesized group.
487 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
488 fParenStack
.push(atomic
, *fStatus
); // Frame type.
489 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
490 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
495 case doOpenLookAhead
:
496 // Positive Look-ahead (?= stuff )
498 // 1 START_LA dataLoc
499 // 2. NOP reserved for use by quantifiers on the block.
500 // Look-ahead can't have quantifiers, but paren stack
501 // compile time conventions require the slot anyhow.
502 // 3. NOP may be replaced if there is are '|' ops in the block.
503 // 4. code for parenthesized stuff.
506 // Two data slots are reserved, for saving the stack ptr and the input position.
508 int32_t dataLoc
= fRXPat
->fDataSize
;
509 fRXPat
->fDataSize
+= 2;
510 int32_t op
= URX_BUILD(URX_LA_START
, dataLoc
);
511 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
513 op
= URX_BUILD(URX_NOP
, 0);
514 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
515 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
517 // On the Parentheses stack, start a new frame and add the postions
519 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
520 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
521 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
522 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
526 case doOpenLookAheadNeg
:
527 // Negated Lookahead. (?! stuff )
529 // 1. START_LA dataloc
530 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
531 // // which continues with the match.
532 // 3. NOP // Std. Open Paren sequence, for possible '|'
533 // 4. code for parenthesized stuff.
534 // 5. END_LA // Cut back stack, remove saved state from step 2.
535 // 6. FAIL // code in block succeeded, so neg. lookahead fails.
538 int32_t dataLoc
= fRXPat
->fDataSize
;
539 fRXPat
->fDataSize
+= 2;
540 int32_t op
= URX_BUILD(URX_LA_START
, dataLoc
);
541 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
543 op
= URX_BUILD(URX_STATE_SAVE
, 0); // dest address will be patched later.
544 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
546 op
= URX_BUILD(URX_NOP
, 0);
547 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
549 // On the Parentheses stack, start a new frame and add the postions
550 // of the StateSave and NOP.
551 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
552 fParenStack
.push( negLookAhead
, *fStatus
); // Frame type
553 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
554 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
556 // Instructions #5 and #6 will be added when the ')' is encountered.
560 case doOpenLookBehind
:
562 // Compile a (?<= look-behind open paren.
565 // 0 URX_LB_START dataLoc
566 // 1 URX_LB_CONT dataLoc
569 // 4 URX_NOP Standard '(' boilerplate.
570 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
571 // 6 <code for LookBehind expression>
572 // 7 URX_LB_END dataLoc # Check match len, restore input len
573 // 8 URX_LA_END dataLoc # Restore stack, input pos
575 // Allocate a block of matcher data, to contain (when running a match)
576 // 0: Stack ptr on entry
577 // 1: Input Index on entry
578 // 2: Start index of match current match attempt.
579 // 3: Original Input String len.
581 // Allocate data space
582 int32_t dataLoc
= fRXPat
->fDataSize
;
583 fRXPat
->fDataSize
+= 4;
586 int32_t op
= URX_BUILD(URX_LB_START
, dataLoc
);
587 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
590 op
= URX_BUILD(URX_LB_CONT
, dataLoc
);
591 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
592 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MinMatchLength. To be filled later.
593 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MaxMatchLength. To be filled later.
596 op
= URX_BUILD(URX_NOP
, 0);
597 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
598 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
600 // On the Parentheses stack, start a new frame and add the postions
601 // of the URX_LB_CONT and the NOP.
602 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
603 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
604 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
605 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
607 // The final two instructions will be added when the ')' is encountered.
612 case doOpenLookBehindNeg
:
614 // Compile a (?<! negated look-behind open paren.
617 // 0 URX_LB_START dataLoc # Save entry stack, input len
618 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
622 // 5 URX_NOP Standard '(' boilerplate.
623 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
624 // 7 <code for LookBehind expression>
625 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
628 // Allocate a block of matcher data, to contain (when running a match)
629 // 0: Stack ptr on entry
630 // 1: Input Index on entry
631 // 2: Start index of match current match attempt.
632 // 3: Original Input String len.
634 // Allocate data space
635 int32_t dataLoc
= fRXPat
->fDataSize
;
636 fRXPat
->fDataSize
+= 4;
639 int32_t op
= URX_BUILD(URX_LB_START
, dataLoc
);
640 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
643 op
= URX_BUILD(URX_LBN_CONT
, dataLoc
);
644 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
645 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MinMatchLength. To be filled later.
646 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // MaxMatchLength. To be filled later.
647 fRXPat
->fCompiledPat
->addElement(0, *fStatus
); // Continue Loc. To be filled later.
650 op
= URX_BUILD(URX_NOP
, 0);
651 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
652 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
654 // On the Parentheses stack, start a new frame and add the postions
655 // of the URX_LB_CONT and the NOP.
656 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
657 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
658 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
659 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
661 // The final two instructions will be added when the ')' is encountered.
665 case doConditionalExpr
:
666 // Conditionals such as (?(1)a:b)
668 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
669 error(U_REGEX_UNIMPLEMENTED
);
675 if (fParenStack
.size() <= 0) {
676 // Extra close paren, or missing open paren.
677 error(U_REGEX_MISMATCHED_PAREN
);
685 case doBadOpenParenType
:
687 error(U_REGEX_RULE_SYNTAX
);
691 case doMismatchedParenErr
:
692 error(U_REGEX_MISMATCHED_PAREN
);
696 // Normal '+' compiles to
697 // 1. stuff to be repeated (already built)
701 // Or, if the item to be repeated can match a zero length string,
702 // 1. STO_INP_LOC data-loc
703 // 2. body of stuff to be repeated
708 // Or, if the item to be repeated is simple
709 // 1. Item to be repeated.
710 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
711 // 3. LOOP_C stack location
713 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
716 // Check for simple constructs, which may get special optimized code.
717 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
718 int32_t repeatedOp
= fRXPat
->fCompiledPat
->elementAti(topLoc
);
720 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
721 // Emit optimized code for [char set]+
722 int32_t loopOpI
= URX_BUILD(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
723 fRXPat
->fCompiledPat
->addElement(loopOpI
, *fStatus
);
724 frameLoc
= fRXPat
->fFrameSize
;
725 fRXPat
->fFrameSize
++;
726 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, frameLoc
);
727 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
731 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
732 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
733 // Emit Optimized code for .+ operations.
734 int32_t loopOpI
= URX_BUILD(URX_LOOP_DOT_I
, 0);
735 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
736 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
739 fRXPat
->fCompiledPat
->addElement(loopOpI
, *fStatus
);
740 frameLoc
= fRXPat
->fFrameSize
;
741 fRXPat
->fFrameSize
++;
742 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, frameLoc
);
743 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
751 // Check for minimum match length of zero, which requires
752 // extra loop-breaking code.
753 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
754 // Zero length match is possible.
755 // Emit the code sequence that can handle it.
757 frameLoc
= fRXPat
->fFrameSize
;
758 fRXPat
->fFrameSize
++;
760 int32_t op
= URX_BUILD(URX_STO_INP_LOC
, frameLoc
);
761 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
763 op
= URX_BUILD(URX_JMP_SAV_X
, topLoc
+1);
764 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
766 // Simpler code when the repeated body must match something non-empty
767 int32_t jmpOp
= URX_BUILD(URX_JMP_SAV
, topLoc
);
768 fRXPat
->fCompiledPat
->addElement(jmpOp
, *fStatus
);
774 // Non-greedy '+?' compiles to
775 // 1. stuff to be repeated (already built)
779 int32_t topLoc
= blockTopLoc(FALSE
);
780 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, topLoc
);
781 fRXPat
->fCompiledPat
->addElement(saveStateOp
, *fStatus
);
787 // Normal (greedy) ? quantifier.
790 // 2. body of optional block
792 // Insert the state save into the compiled pattern, and we're done.
794 int32_t saveStateLoc
= blockTopLoc(TRUE
);
795 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
796 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
801 // Non-greedy ?? quantifier
804 // 2. body of optional block
808 // This code is less than ideal, with two jmps instead of one, because we can only
809 // insert one instruction at the top of the block being iterated.
811 int32_t jmp1_loc
= blockTopLoc(TRUE
);
812 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
814 int32_t jmp1_op
= URX_BUILD(URX_JMP
, jmp2_loc
+1);
815 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
817 int32_t jmp2_op
= URX_BUILD(URX_JMP
, jmp2_loc
+2);
818 fRXPat
->fCompiledPat
->addElement(jmp2_op
, *fStatus
);
820 int32_t save_op
= URX_BUILD(URX_STATE_SAVE
, jmp1_loc
+1);
821 fRXPat
->fCompiledPat
->addElement(save_op
, *fStatus
);
827 // Normal (greedy) * quantifier.
830 // 2. body of stuff being iterated over
834 // Or, if the body is a simple [Set],
835 // 1. LOOP_SR_I set number
836 // 2. LOOP_C stack location
839 // Or if this is a .*
840 // 1. LOOP_DOT_I (. matches all mode flag)
841 // 2. LOOP_C stack location
843 // Or, if the body can match a zero-length string, to inhibit infinite loops,
845 // 2. STO_INP_LOC data-loc
850 // location of item #1, the STATE_SAVE
851 int32_t topLoc
= blockTopLoc(FALSE
);
852 int32_t dataLoc
= -1;
854 // Check for simple *, where the construct being repeated
855 // compiled to single opcode, and might be optimizable.
856 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
857 int32_t repeatedOp
= fRXPat
->fCompiledPat
->elementAti(topLoc
);
859 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
860 // Emit optimized code for a [char set]*
861 int32_t loopOpI
= URX_BUILD(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
862 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
863 dataLoc
= fRXPat
->fFrameSize
;
864 fRXPat
->fFrameSize
++;
865 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, dataLoc
);
866 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
870 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
871 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
872 // Emit Optimized code for .* operations.
873 int32_t loopOpI
= URX_BUILD(URX_LOOP_DOT_I
, 0);
874 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
875 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
878 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
879 dataLoc
= fRXPat
->fFrameSize
;
880 fRXPat
->fFrameSize
++;
881 int32_t loopOpC
= URX_BUILD(URX_LOOP_C
, dataLoc
);
882 fRXPat
->fCompiledPat
->addElement(loopOpC
, *fStatus
);
887 // Emit general case code for this *
888 // The optimizations did not apply.
890 int32_t saveStateLoc
= blockTopLoc(TRUE
);
891 int32_t jmpOp
= URX_BUILD(URX_JMP_SAV
, saveStateLoc
+1);
893 // Check for minimum match length of zero, which requires
894 // extra loop-breaking code.
895 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
896 insertOp(saveStateLoc
);
897 dataLoc
= fRXPat
->fFrameSize
;
898 fRXPat
->fFrameSize
++;
900 int32_t op
= URX_BUILD(URX_STO_INP_LOC
, dataLoc
);
901 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
902 jmpOp
= URX_BUILD(URX_JMP_SAV_X
, saveStateLoc
+2);
905 // Locate the position in the compiled pattern where the match will continue
906 // after completing the *. (4 or 5 in the comment above)
907 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
909 // Put together the save state op store it into the compiled code.
910 int32_t saveStateOp
= URX_BUILD(URX_STATE_SAVE
, continueLoc
);
911 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
913 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
914 fRXPat
->fCompiledPat
->addElement(jmpOp
, *fStatus
);
919 // Non-greedy *? quantifier
922 // 2. body of stuff being iterated over
926 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
927 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
928 int32_t jmpOp
= URX_BUILD(URX_JMP
, saveLoc
);
929 int32_t stateSaveOp
= URX_BUILD(URX_STATE_SAVE
, jmpLoc
+1);
930 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
931 fRXPat
->fCompiledPat
->addElement(stateSaveOp
, *fStatus
);
937 // The '{' opening an interval quantifier was just scanned.
938 // Init the counter varaiables that will accumulate the values as the digits
944 case doIntevalLowerDigit
:
945 // Scanned a digit from the lower value of an {lower,upper} interval
947 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
948 U_ASSERT(digitValue
>= 0);
949 fIntervalLow
= fIntervalLow
*10 + digitValue
;
950 if (fIntervalLow
< 0) {
951 error(U_REGEX_NUMBER_TOO_BIG
);
956 case doIntervalUpperDigit
:
957 // Scanned a digit from the upper value of an {lower,upper} interval
959 if (fIntervalUpper
< 0) {
962 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
963 U_ASSERT(digitValue
>= 0);
964 fIntervalUpper
= fIntervalUpper
*10 + digitValue
;
965 if (fIntervalUpper
< 0) {
966 error(U_REGEX_NUMBER_TOO_BIG
);
972 // Scanned a single value interval like {27}. Upper = Lower.
973 fIntervalUpper
= fIntervalLow
;
977 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
978 if (compileInlineInterval() == FALSE
) {
979 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
983 case doPossessiveInterval
:
984 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
986 // Remember the loc for the top of the block being looped over.
987 // (Can not reserve a slot in the compiled pattern at this time, becuase
988 // compileInterval needs to reserve also, and blockTopLoc can only reserve
990 int32_t topLoc
= blockTopLoc(FALSE
);
992 // Produce normal looping code.
993 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
995 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
996 // just as if the loop was inclosed in atomic parentheses.
998 // First the STO_SP before the start of the loop
1000 int32_t varLoc
= fRXPat
->fDataSize
; // Reserve a data location for saving the
1001 fRXPat
->fDataSize
+= 1; // state stack ptr.
1002 int32_t op
= URX_BUILD(URX_STO_SP
, varLoc
);
1003 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1005 int32_t loopOp
= fRXPat
->fCompiledPat
->popi();
1006 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1007 loopOp
++; // point LoopOp after the just-inserted STO_SP
1008 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1010 // Then the LD_SP after the end of the loop
1011 op
= URX_BUILD(URX_LD_SP
, varLoc
);
1012 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1018 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1019 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1022 case doIntervalError
:
1023 error(U_REGEX_BAD_INTERVAL
);
1027 // We've just scanned a "normal" character from the pattern,
1028 literalChar(fC
.fChar
);
1034 // scanned a ".", match any single character.
1037 if (fModeFlags
& UREGEX_DOTALL
) {
1038 op
= URX_BUILD(URX_DOTANY_ALL
, 0);
1040 op
= URX_BUILD(URX_DOTANY
, 0);
1042 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1048 int32_t op
= (fModeFlags
& UREGEX_MULTILINE
)? URX_CARET_M
: URX_CARET
;
1049 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1056 int32_t op
= (fModeFlags
& UREGEX_MULTILINE
)? URX_DOLLAR_M
: URX_DOLLAR
;
1057 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1062 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_CARET
, 0), *fStatus
);
1067 #if UCONFIG_NO_BREAK_ITERATION==1
1068 if (fModeFlags
& UREGEX_UWORD
) {
1069 error(U_UNSUPPORTED_ERROR
);
1072 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1073 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 1), *fStatus
);
1079 #if UCONFIG_NO_BREAK_ITERATION==1
1080 if (fModeFlags
& UREGEX_UWORD
) {
1081 error(U_UNSUPPORTED_ERROR
);
1084 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1085 fRXPat
->fCompiledPat
->addElement(URX_BUILD(op
, 0), *fStatus
);
1090 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_D
, 1), *fStatus
);
1094 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_D
, 0), *fStatus
);
1098 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_G
, 0), *fStatus
);
1102 fRXPat
->fCompiledPat
->addElement(
1103 URX_BUILD(URX_STAT_SETREF_N
, URX_ISSPACE_SET
), *fStatus
);
1107 fRXPat
->fCompiledPat
->addElement(
1108 URX_BUILD(URX_STATIC_SETREF
, URX_ISSPACE_SET
), *fStatus
);
1112 fRXPat
->fCompiledPat
->addElement(
1113 URX_BUILD(URX_STAT_SETREF_N
, URX_ISWORD_SET
), *fStatus
);
1117 fRXPat
->fCompiledPat
->addElement(
1118 URX_BUILD(URX_STATIC_SETREF
, URX_ISWORD_SET
), *fStatus
);
1122 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_X
, 0), *fStatus
);
1127 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_DOLLAR
, 0), *fStatus
);
1131 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKSLASH_Z
, 0), *fStatus
);
1135 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1144 UnicodeSet
*theSet
= scanProp();
1150 case doScanUnicodeSet
:
1152 UnicodeSet
*theSet
= scanSet();
1157 case doEnterQuoteMode
:
1158 // Just scanned a \Q. Put character scanner into quote mode.
1163 // BackReference. Somewhat unusual in that the front-end can not completely parse
1164 // the regular expression, because the number of digits to be consumed
1165 // depends on the number of capture groups that have been defined. So
1166 // we have to do it here instead.
1168 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1169 int32_t groupNum
= 0;
1170 UChar32 c
= fC
.fChar
;
1173 // Loop once per digit, for max allowed number of digits in a back reference.
1174 int32_t digit
= u_charDigitValue(c
);
1175 groupNum
= groupNum
* 10 + digit
;
1176 if (groupNum
>= numCaptureGroups
) {
1180 if (RegexStaticSets::gStaticSets
->fRuleDigits
->contains(c
) == FALSE
) {
1186 // Scan of the back reference in the source regexp is complete. Now generate
1187 // the compiled code for it.
1188 // Because capture groups can be forward-referenced by back-references,
1189 // we fill the operand with the capture group number. At the end
1190 // of compilation, it will be changed to the variable's location.
1191 U_ASSERT(groupNum
> 0);
1193 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1194 op
= URX_BUILD(URX_BACKREF_I
, groupNum
);
1196 op
= URX_BUILD(URX_BACKREF
, groupNum
);
1198 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1203 case doPossessivePlus
:
1204 // Possessive ++ quantifier.
1207 // 2. body of stuff being iterated over
1213 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1214 // then unconditionally discarded. Perhaps introduce a new opcode
1218 int32_t topLoc
= blockTopLoc(TRUE
);
1219 int32_t stoLoc
= fRXPat
->fDataSize
;
1220 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1221 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1222 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1224 // Emit the STATE_SAVE
1225 op
= URX_BUILD(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1226 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1229 op
= URX_BUILD(URX_JMP
, topLoc
+1);
1230 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1233 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1234 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1238 case doPossessiveStar
:
1239 // Possessive *+ quantifier.
1243 // 3. body of stuff being iterated over
1247 // TODO: do something to cut back the state stack each time through the loop.
1249 // Reserve two slots at the top of the block.
1250 int32_t topLoc
= blockTopLoc(TRUE
);
1254 int32_t stoLoc
= fRXPat
->fDataSize
;
1255 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1256 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1257 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1259 // Emit the SAVE_STATE 5
1260 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1261 op
= URX_BUILD(URX_STATE_SAVE
, L7
);
1262 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1264 // Append the JMP operation.
1265 op
= URX_BUILD(URX_JMP
, topLoc
+1);
1266 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1268 // Emit the LD_SP loc
1269 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1270 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1274 case doPossessiveOpt
:
1275 // Possessive ?+ quantifier.
1279 // 3. body of optional block
1284 // Reserve two slots at the top of the block.
1285 int32_t topLoc
= blockTopLoc(TRUE
);
1289 int32_t stoLoc
= fRXPat
->fDataSize
;
1290 fRXPat
->fDataSize
++; // Reserve the data location for storing save stack ptr.
1291 int32_t op
= URX_BUILD(URX_STO_SP
, stoLoc
);
1292 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1294 // Emit the SAVE_STATE
1295 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1296 op
= URX_BUILD(URX_STATE_SAVE
, continueLoc
);
1297 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1300 op
= URX_BUILD(URX_LD_SP
, stoLoc
);
1301 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1306 case doBeginMatchMode
:
1307 fNewModeFlags
= fModeFlags
;
1308 fSetModeFlag
= TRUE
;
1311 case doMatchMode
: // (?i) and similar
1315 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1316 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1317 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1318 case 0x77: /* 'w' */ bit
= UREGEX_UWORD
; break;
1319 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1320 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1322 U_ASSERT(FALSE
); // Should never happen. Other chars are filtered out
1326 fNewModeFlags
|= bit
;
1328 fNewModeFlags
&= ~bit
;
1333 case doSetMatchMode
:
1334 // We've got a (?i) or similar. The match mode is being changed, but
1335 // the change is not scoped to a parenthesized block.
1336 fModeFlags
= fNewModeFlags
;
1338 // Prevent any string from spanning across the change of match mode.
1339 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
1344 case doMatchModeParen
:
1345 // We've got a (?i: or similar. Begin a parenthesized block, save old
1346 // mode flags so they can be restored at the close of the block.
1349 // - NOP, which later may be replaced by a save-state if the
1350 // parenthesized group gets a * quantifier, followed by
1351 // - NOP, which may later be replaced by a save-state if there
1352 // is an '|' alternation within the parens.
1354 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
1355 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_NOP
, 0), *fStatus
);
1357 // On the Parentheses stack, start a new frame and add the postions
1358 // of the two NOPs (a normal non-capturing () frame, except for the
1359 // saving of the orignal mode flags.)
1360 fParenStack
.push(fModeFlags
, *fStatus
);
1361 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1362 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1363 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1365 // Set the current mode flags to the new values.
1366 fModeFlags
= fNewModeFlags
;
1371 error(U_REGEX_INVALID_FLAG
);
1374 case doSuppressComments
:
1375 // We have just scanned a '(?'. We now need to prevent the character scanner from
1376 // treating a '#' as a to-the-end-of-line comment.
1377 // (This Perl compatibility just gets uglier and uglier to do...)
1378 fEOLComments
= FALSE
;
1385 error(U_REGEX_INTERNAL_ERROR
);
1389 if (U_FAILURE(*fStatus
)) {
1398 //------------------------------------------------------------------------------
1400 // literalChar We've encountered a literal character from the pattern,
1401 // or an escape sequence that reduces to a character.
1402 // Add it to the string containing all literal chars/strings from
1404 // If we are in a pattern string already, add the new char to it.
1405 // If we aren't in a pattern string, begin one now.
1407 //------------------------------------------------------------------------------
1408 void RegexCompile::literalChar(UChar32 c
) {
1409 int32_t op
; // An operation in the compiled pattern.
1411 int32_t patternLoc
; // A position in the compiled pattern.
1415 // If the last thing compiled into the pattern was not a literal char,
1416 // force this new literal char to begin a new string, and not append to the previous.
1417 op
= fRXPat
->fCompiledPat
->lastElementi();
1418 opType
= URX_TYPE(op
);
1419 if (!(opType
== URX_STRING_LEN
|| opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
)) {
1423 if (fStringOpStart
== -1) {
1424 // First char of a string in the pattern.
1425 // Emit a OneChar op into the compiled pattern.
1428 // Also add it to the string pool, in case we get a second adjacent literal
1429 // and want to change form ONE_CHAR to STRING
1430 fStringOpStart
= fRXPat
->fLiteralText
.length();
1431 fRXPat
->fLiteralText
.append(c
);
1435 // We are adding onto an existing string
1436 fRXPat
->fLiteralText
.append(c
);
1438 op
= fRXPat
->fCompiledPat
->lastElementi();
1439 opType
= URX_TYPE(op
);
1440 U_ASSERT(opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
|| opType
== URX_STRING_LEN
);
1442 // If the most recently emitted op is a URX_ONECHAR,
1443 if (opType
== URX_ONECHAR
|| opType
== URX_ONECHAR_I
) {
1444 if (U16_IS_TRAIL(c
) && U16_IS_LEAD(URX_VAL(op
))) {
1445 // The most recently emitted op is a ONECHAR that was the first half
1446 // of a surrogate pair. Update the ONECHAR's operand to be the
1447 // supplementary code point resulting from both halves of the pair.
1448 c
= U16_GET_SUPPLEMENTARY(URX_VAL(op
), c
);
1449 op
= URX_BUILD(opType
, c
);
1450 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1451 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1455 // The most recently emitted op is a ONECHAR.
1456 // We've now received another adjacent char. Change the ONECHAR op
1458 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1459 op
= URX_BUILD(URX_STRING_I
, fStringOpStart
);
1461 op
= URX_BUILD(URX_STRING
, fStringOpStart
);
1463 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1464 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1465 op
= URX_BUILD(URX_STRING_LEN
, 0);
1466 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1469 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the
1470 // string length to reflect the new char we just added to the string.
1471 stringLen
= fRXPat
->fLiteralText
.length() - fStringOpStart
;
1472 op
= URX_BUILD(URX_STRING_LEN
, stringLen
);
1473 patternLoc
= fRXPat
->fCompiledPat
->size() - 1;
1474 fRXPat
->fCompiledPat
->setElementAt(op
, patternLoc
);
1479 //------------------------------------------------------------------------------
1481 // emitONE_CHAR emit a ONE_CHAR op into the generated code.
1482 // Choose cased or uncased version, depending on the
1483 // match mode and whether the character itself is cased.
1485 //------------------------------------------------------------------------------
1486 void RegexCompile::emitONE_CHAR(UChar32 c
) {
1488 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1489 u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
1490 // We have a cased character, and are in case insensitive matching mode.
1491 c
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
1492 op
= URX_BUILD(URX_ONECHAR_I
, c
);
1494 // Uncased char, or case sensitive match mode.
1495 // Either way, just generate a literal compare of the char.
1496 op
= URX_BUILD(URX_ONECHAR
, c
);
1498 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1502 //------------------------------------------------------------------------------
1504 // fixLiterals When compiling something that can follow a literal
1505 // string in a pattern, we need to "fix" any preceding
1506 // string, which will cause any subsequent literals to
1507 // begin a new string, rather than appending to the
1510 // Optionally, split the last char of the string off into
1511 // a single "ONE_CHAR" operation, so that quantifiers can
1512 // apply to that char alone. Example: abc*
1513 // The * must apply to the 'c' only.
1515 //------------------------------------------------------------------------------
1516 void RegexCompile::fixLiterals(UBool split
) {
1517 int32_t stringStart
= fStringOpStart
; // start index of the current literal string
1518 int32_t op
; // An op from/for the compiled pattern.
1519 int32_t opType
; // An opcode type from the compiled pattern.
1520 int32_t stringLastCharIdx
;
1522 int32_t stringNextToLastCharIdx
;
1523 UChar32 nextToLastChar
;
1526 fStringOpStart
= -1;
1531 // Split: We need to ensure that the last item in the compiled pattern does
1532 // not refer to a literal string of more than one char. If it does,
1533 // separate the last char from the rest of the string.
1535 // If the last operation from the compiled pattern is not a string,
1536 // nothing needs to be done
1537 op
= fRXPat
->fCompiledPat
->lastElementi();
1538 opType
= URX_TYPE(op
);
1539 if (opType
!= URX_STRING_LEN
) {
1542 stringLen
= URX_VAL(op
);
1545 // Find the position of the last code point in the string (might be a surrogate pair)
1547 stringLastCharIdx
= fRXPat
->fLiteralText
.length();
1548 stringLastCharIdx
= fRXPat
->fLiteralText
.moveIndex32(stringLastCharIdx
, -1);
1549 lastChar
= fRXPat
->fLiteralText
.char32At(stringLastCharIdx
);
1551 // The string should always be at least two code points long, meaning that there
1552 // should be something before the last char position that we just found.
1553 U_ASSERT(stringLastCharIdx
> stringStart
);
1554 stringNextToLastCharIdx
= fRXPat
->fLiteralText
.moveIndex32(stringLastCharIdx
, -1);
1555 U_ASSERT(stringNextToLastCharIdx
>= stringStart
);
1556 nextToLastChar
= fRXPat
->fLiteralText
.char32At(stringNextToLastCharIdx
);
1558 if (stringNextToLastCharIdx
> stringStart
) {
1559 // The length of string remaining after removing one char is two or more.
1560 // Leave the string in the compiled pattern, shorten it by one char,
1561 // and append a URX_ONECHAR op for the last char.
1562 stringLen
-= (fRXPat
->fLiteralText
.length() - stringLastCharIdx
);
1563 op
= URX_BUILD(URX_STRING_LEN
, stringLen
);
1564 fRXPat
->fCompiledPat
->setElementAt(op
, fRXPat
->fCompiledPat
->size() -1);
1565 emitONE_CHAR(lastChar
);
1567 // The original string consisted of exactly two characters. Replace
1568 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
1570 fRXPat
->fCompiledPat
->setSize(fRXPat
->fCompiledPat
->size() -2);
1571 emitONE_CHAR(nextToLastChar
);
1572 emitONE_CHAR(lastChar
);
1581 //------------------------------------------------------------------------------
1583 // insertOp() Insert a slot for a new opcode into the already
1584 // compiled pattern code.
1586 // Fill the slot with a NOP. Our caller will replace it
1587 // with what they really wanted.
1589 //------------------------------------------------------------------------------
1590 void RegexCompile::insertOp(int32_t where
) {
1591 UVector32
*code
= fRXPat
->fCompiledPat
;
1592 U_ASSERT(where
>0 && where
< code
->size());
1594 int32_t nop
= URX_BUILD(URX_NOP
, 0);
1595 code
->insertElementAt(nop
, where
, *fStatus
);
1597 // Walk through the pattern, looking for any ops with targets that
1598 // were moved down by the insert. Fix them.
1600 for (loc
=0; loc
<code
->size(); loc
++) {
1601 int32_t op
= code
->elementAti(loc
);
1602 int32_t opType
= URX_TYPE(op
);
1603 int32_t opValue
= URX_VAL(op
);
1604 if ((opType
== URX_JMP
||
1605 opType
== URX_JMPX
||
1606 opType
== URX_STATE_SAVE
||
1607 opType
== URX_CTR_LOOP
||
1608 opType
== URX_CTR_LOOP_NG
||
1609 opType
== URX_JMP_SAV
||
1610 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
1611 // Target location for this opcode is after the insertion point and
1612 // needs to be incremented to adjust for the insertion.
1614 op
= URX_BUILD(opType
, opValue
);
1615 code
->setElementAt(op
, loc
);
1619 // Now fix up the parentheses stack. All positive values in it are locations in
1620 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
1621 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
1622 int32_t x
= fParenStack
.elementAti(loc
);
1625 fParenStack
.setElementAt(x
, loc
);
1629 if (fMatchCloseParen
> where
) {
1632 if (fMatchOpenParen
> where
) {
1639 //------------------------------------------------------------------------------
1641 // blockTopLoc() Find or create a location in the compiled pattern
1642 // at the start of the operation or block that has
1643 // just been compiled. Needed when a quantifier (* or
1644 // whatever) appears, and we need to add an operation
1645 // at the start of the thing being quantified.
1647 // (Parenthesized Blocks) have a slot with a NOP that
1648 // is reserved for this purpose. .* or similar don't
1649 // and a slot needs to be added.
1651 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
1652 // at the returned location.
1653 // FALSE - just return the address,
1654 // do not reserve a location there.
1656 //------------------------------------------------------------------------------
1657 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
1659 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
1661 // The item just processed is a parenthesized block.
1662 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
1663 U_ASSERT(theLoc
> 0);
1664 U_ASSERT(URX_TYPE(((uint32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
))) == URX_NOP
);
1667 // Item just compiled is a single thing, a ".", or a single char, or a set reference.
1668 // No slot for STATE_SAVE was pre-reserved in the compiled code.
1669 // We need to make space now.
1670 fixLiterals(TRUE
); // If last item was a string, separate the last char.
1671 theLoc
= fRXPat
->fCompiledPat
->size()-1;
1673 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/
1674 int32_t nop
= URX_BUILD(URX_NOP
, 0);
1675 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
1683 //------------------------------------------------------------------------------
1685 // handleCloseParen When compiling a close paren, we need to go back
1686 // and fix up any JMP or SAVE operations within the
1687 // parenthesized block that need to target the end
1688 // of the block. The locations of these are kept on
1689 // the paretheses stack.
1691 // This function is called both when encountering a
1692 // real ) and at the end of the pattern.
1694 //-------------------------------------------------------------------------------
1695 void RegexCompile::handleCloseParen() {
1698 if (fParenStack
.size() <= 0) {
1699 error(U_REGEX_MISMATCHED_PAREN
);
1703 // Force any literal chars that may follow the close paren to start a new string,
1704 // and not attach to any preceding it.
1707 // Fixup any operations within the just-closed parenthesized group
1708 // that need to reference the end of the (block).
1709 // (The first one popped from the stack is an unused slot for
1710 // alternation (OR) state save, but applying the fixup to it does no harm.)
1712 patIdx
= fParenStack
.popi();
1714 // value < 0 flags the start of the frame on the paren stack.
1717 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
1718 patOp
= fRXPat
->fCompiledPat
->elementAti(patIdx
);
1719 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
1720 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
1721 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
1722 fMatchOpenParen
= patIdx
;
1725 // At the close of any parenthesized block, restore the match mode flags to
1726 // the value they had at the open paren. Saved value is
1727 // at the top of the paren stack.
1728 fModeFlags
= fParenStack
.popi();
1730 // DO any additional fixups, depending on the specific kind of
1731 // parentesized grouping this is
1736 // No additional fixups required.
1737 // (Grouping-only parentheses)
1740 // Capturing Parentheses.
1741 // Insert a End Capture op into the pattern.
1742 // The frame offset of the variables for this cg is obtained from the
1743 // start capture op and put it into the end-capture op.
1745 int32_t captureOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
1746 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
1748 int32_t frameVarLocation
= URX_VAL(captureOp
);
1749 int32_t endCaptureOp
= URX_BUILD(URX_END_CAPTURE
, frameVarLocation
);
1750 fRXPat
->fCompiledPat
->addElement(endCaptureOp
, *fStatus
);
1754 // Atomic Parenthesis.
1755 // Insert a LD_SP operation to restore the state stack to the position
1756 // it was when the atomic parens were entered.
1758 int32_t stoOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
1759 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
1760 int32_t stoLoc
= URX_VAL(stoOp
);
1761 int32_t ldOp
= URX_BUILD(URX_LD_SP
, stoLoc
);
1762 fRXPat
->fCompiledPat
->addElement(ldOp
, *fStatus
);
1768 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
1769 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
1770 int32_t dataLoc
= URX_VAL(startOp
);
1771 int32_t op
= URX_BUILD(URX_LA_END
, dataLoc
);
1772 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1778 // See comment at doOpenLookAheadNeg
1779 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
1780 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
1781 int32_t dataLoc
= URX_VAL(startOp
);
1782 int32_t op
= URX_BUILD(URX_LA_END
, dataLoc
);
1783 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1784 op
= URX_BUILD(URX_FAIL
, 0);
1785 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1787 // Patch the URX_SAVE near the top of the block.
1788 int32_t saveOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
1789 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
1790 int32_t dest
= fRXPat
->fCompiledPat
->size();
1791 saveOp
= URX_BUILD(URX_STATE_SAVE
, dest
);
1792 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
1798 // See comment at doOpenLookBehind.
1800 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
1801 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
1802 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
1803 int32_t dataLoc
= URX_VAL(startOp
);
1804 int32_t op
= URX_BUILD(URX_LB_END
, dataLoc
);
1805 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1806 op
= URX_BUILD(URX_LA_END
, dataLoc
);
1807 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1809 // Determine the min and max bounds for the length of the
1810 // string that the pattern can match.
1811 // An unbounded upper limit is an error.
1812 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
1813 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
1814 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
1815 if (maxML
== INT32_MAX
) {
1816 error(U_REGEX_LOOK_BEHIND_LIMIT
);
1819 U_ASSERT(minML
<= maxML
);
1821 // Insert the min and max match len bounds into the URX_LB_CONT op that
1822 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1823 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
1824 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
1833 // See comment at doOpenLookBehindNeg.
1835 // Append the URX_LBN_END to the compiled pattern.
1836 int32_t startOp
= fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
1837 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
1838 int32_t dataLoc
= URX_VAL(startOp
);
1839 int32_t op
= URX_BUILD(URX_LBN_END
, dataLoc
);
1840 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1842 // Determine the min and max bounds for the length of the
1843 // string that the pattern can match.
1844 // An unbounded upper limit is an error.
1845 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
1846 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
1847 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
1848 if (maxML
== INT32_MAX
) {
1849 error(U_REGEX_LOOK_BEHIND_LIMIT
);
1852 U_ASSERT(minML
<= maxML
);
1854 // Insert the min and max match len bounds into the URX_LB_CONT op that
1855 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1856 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
1857 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
1859 // Insert the pattern location to continue at after a successful match
1860 // as the last operand of the URX_LBN_CONT
1861 op
= URX_BUILD(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
1862 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
1872 // remember the next location in the compiled pattern.
1873 // The compilation of Quantifiers will look at this to see whether its looping
1874 // over a parenthesized block or a single item
1875 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
1880 //----------------------------------------------------------------------------------------
1882 // compileSet Compile the pattern operations for a reference to a
1885 //----------------------------------------------------------------------------------------
1886 void RegexCompile::compileSet(UnicodeSet
*theSet
)
1888 if (theSet
== NULL
) {
1891 int32_t setSize
= theSet
->size();
1892 UChar32 firstSetChar
= theSet
->charAt(0);
1893 if (firstSetChar
== -1) {
1894 // Sets that contain only strings, but no individual chars,
1895 // will end up here.
1896 error(U_REGEX_SET_CONTAINS_STRING
);
1903 // Set of no elements. Always fails to match.
1904 fRXPat
->fCompiledPat
->addElement(URX_BUILD(URX_BACKTRACK
, 0), *fStatus
);
1911 // The set contains only a single code point. Put it into
1912 // the compiled pattern as a single char operation rather
1913 // than a set, and discard the set itself.
1914 literalChar(firstSetChar
);
1921 // The set contains two or more chars. (the normal case)
1922 // Put it into the compiled pattern as a set.
1923 int32_t setNumber
= fRXPat
->fSets
->size();
1924 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
1925 int32_t setOp
= URX_BUILD(URX_SETREF
, setNumber
);
1926 fRXPat
->fCompiledPat
->addElement(setOp
, *fStatus
);
1932 //----------------------------------------------------------------------------------------
1934 // compileInterval Generate the code for a {min, max} style interval quantifier.
1935 // Except for the specific opcodes used, the code is the same
1936 // for all three types (greedy, non-greedy, possessive) of
1937 // intervals. The opcodes are supplied as parameters.
1939 // The code for interval loops has this form:
1940 // 0 CTR_INIT counter loc (in stack frame)
1941 // 1 5 patt address of CTR_LOOP at bottom of block
1943 // 3 max count (-1 for unbounded)
1944 // 4 ... block to be iterated over
1948 //----------------------------------------------------------------------------------------
1949 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
1951 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
1952 // four slots in the compiled code. Reserve them.
1953 int32_t topOfBlock
= blockTopLoc(TRUE
);
1954 insertOp(topOfBlock
);
1955 insertOp(topOfBlock
);
1956 insertOp(topOfBlock
);
1958 // The operands for the CTR_INIT opcode include the index in the matcher data
1959 // of the counter. Allocate it now.
1960 int32_t counterLoc
= fRXPat
->fFrameSize
;
1961 fRXPat
->fFrameSize
++;
1963 int32_t op
= URX_BUILD(InitOp
, counterLoc
);
1964 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
1966 // The second operand of CTR_INIT is the location following the end of the loop.
1967 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
1968 // compilation of something later on causes the code to grow and the target
1969 // position to move.
1970 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
1971 op
= URX_BUILD(URX_RELOC_OPRND
, loopEnd
);
1972 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
1974 // Followed by the min and max counts.
1975 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
1976 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
1978 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
1979 // Goes at end of the block being looped over, so just append to the code so far.
1980 op
= URX_BUILD(LoopOp
, topOfBlock
);
1981 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1983 if ((fIntervalLow
& 0xff000000) != 0 ||
1984 fIntervalUpper
> 0 && (fIntervalUpper
& 0xff000000) != 0) {
1985 error(U_REGEX_NUMBER_TOO_BIG
);
1988 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
1989 error(U_REGEX_MAX_LT_MIN
);
1995 UBool
RegexCompile::compileInlineInterval() {
1996 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
1997 // Too big to inline. Fail, which will cause looping code to be generated.
1998 // (Upper < Lower picks up unbounded upper and errors, both.)
2002 int32_t topOfBlock
= blockTopLoc(FALSE
);
2003 if (fIntervalUpper
== 0) {
2004 // Pathological case. Attempt no matches, as if the block doesn't exist.
2005 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2009 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2010 // The thing being repeated is not a single op, but some
2011 // more complex block. Do it as a loop, not inlines.
2012 // Note that things "repeated" a max of once are handled as inline, because
2013 // the one copy of the code already generated is just fine.
2017 // Pick up the opcode that is to be repeated
2019 int32_t op
= fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2021 // Compute the pattern location where the inline sequence
2022 // will end, and set up the state save op that will be needed.
2024 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2025 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2026 int32_t saveOp
= URX_BUILD(URX_STATE_SAVE
, endOfSequenceLoc
);
2027 if (fIntervalLow
== 0) {
2028 insertOp(topOfBlock
);
2029 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2034 // Loop, emitting the op for the thing being repeated each time.
2035 // Loop starts at 1 because one instance of the op already exists in the pattern,
2036 // it was put there when it was originally encountered.
2038 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2039 if (i
== fIntervalLow
) {
2040 fRXPat
->fCompiledPat
->addElement(saveOp
, *fStatus
);
2042 if (i
> fIntervalLow
) {
2043 fRXPat
->fCompiledPat
->addElement(saveOp
, *fStatus
);
2045 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
2052 //----------------------------------------------------------------------------------------
2054 // matchStartType Determine how a match can start.
2055 // Used to optimize find() operations.
2057 // Operation is very similar to minMatchLength(). Walk the compiled
2058 // pattern, keeping an on-going minimum-match-length. For any
2059 // op where the min match coming in is zero, add that ops possible
2060 // starting matches to the possible starts for the overall pattern.
2062 //----------------------------------------------------------------------------------------
2063 void RegexCompile::matchStartType() {
2064 if (U_FAILURE(*fStatus
)) {
2069 int32_t loc
; // Location in the pattern of the current op being processed.
2070 int32_t op
; // The op being processed
2071 int32_t opType
; // The opcode type of the op
2072 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2073 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2075 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2076 // could have advanced the position in a match.
2077 // (Maximum match length so far == 0)
2079 // forwardedLength is a vector holding minimum-match-length values that
2080 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2081 // It must be one longer than the pattern being checked because some ops
2082 // will jmp to a end-of-block+1 location from within a block, and we must
2083 // count those when checking the block.
2084 int32_t end
= fRXPat
->fCompiledPat
->size();
2085 UVector32
forwardedLength(end
+1, *fStatus
);
2086 forwardedLength
.setSize(end
+1);
2087 for (loc
=3; loc
<end
; loc
++) {
2088 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2091 for (loc
= 3; loc
<end
; loc
++) {
2092 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2093 opType
= URX_TYPE(op
);
2095 // The loop is advancing linearly through the pattern.
2096 // If the op we are now at was the destination of a branch in the pattern,
2097 // and that path has a shorter minimum length than the current accumulated value,
2098 // replace the current accumulated value.
2099 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2100 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2101 currentLen
= forwardedLength
.elementAti(loc
);
2102 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2106 // Ops that don't change the total length matched
2107 case URX_RESERVED_OP
:
2109 case URX_STRING_LEN
:
2111 case URX_START_CAPTURE
:
2112 case URX_END_CAPTURE
:
2113 case URX_BACKSLASH_B
:
2114 case URX_BACKSLASH_BU
:
2115 case URX_BACKSLASH_G
:
2116 case URX_BACKSLASH_Z
:
2118 case URX_RELOC_OPRND
:
2119 case URX_STO_INP_LOC
:
2122 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2125 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2131 fRXPat
->fStartType
= START_START
;
2137 fRXPat
->fStartType
= START_LINE
;
2142 if (currentLen
== 0) {
2143 // This character could appear at the start of a match.
2144 // Add it to the set of possible starting characters.
2145 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2146 numInitialStrings
+= 2;
2154 if (currentLen
== 0) {
2155 int32_t sn
= URX_VAL(op
);
2156 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2157 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2158 fRXPat
->fInitialChars
->addAll(*s
);
2159 numInitialStrings
+= 2;
2166 // [Set]*, like a SETREF, above, in what it can match,
2167 // but may not match at all, so currentLen is not incremented.
2168 if (currentLen
== 0) {
2169 int32_t sn
= URX_VAL(op
);
2170 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2171 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2172 fRXPat
->fInitialChars
->addAll(*s
);
2173 numInitialStrings
+= 2;
2178 case URX_LOOP_DOT_I
:
2179 if (currentLen
== 0) {
2180 // .* at the start of a pattern.
2181 // Any character can begin the match.
2182 fRXPat
->fInitialChars
->clear();
2183 fRXPat
->fInitialChars
->complement();
2184 numInitialStrings
+= 2;
2190 case URX_STATIC_SETREF
:
2191 if (currentLen
== 0) {
2192 int32_t sn
= URX_VAL(op
);
2193 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2194 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2195 fRXPat
->fInitialChars
->addAll(*s
);
2196 numInitialStrings
+= 2;
2204 case URX_STAT_SETREF_N
:
2205 if (currentLen
== 0) {
2206 int32_t sn
= URX_VAL(op
);
2207 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2210 fRXPat
->fInitialChars
->addAll(sc
);
2211 numInitialStrings
+= 2;
2219 case URX_BACKSLASH_D
:
2221 if (currentLen
== 0) {
2223 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2224 if (URX_VAL(op
) != 0) {
2227 fRXPat
->fInitialChars
->addAll(s
);
2228 numInitialStrings
+= 2;
2236 // Case Insensitive Single Character.
2237 if (currentLen
== 0) {
2238 UChar32 c
= URX_VAL(op
);
2239 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2240 // character may have distinct cased forms. Add all of them
2241 // to the set of possible starting match chars.
2243 s
.closeOver(USET_CASE
);
2244 fRXPat
->fInitialChars
->addAll(s
);
2246 // Char has no case variants. Just add it as-is to the
2247 // set of possible starting chars.
2248 fRXPat
->fInitialChars
->add(c
);
2250 numInitialStrings
+= 2;
2257 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2258 case URX_DOTANY_ALL
: // . matches one or two.
2260 case URX_DOTANY_ALL_PL
:
2262 if (currentLen
== 0) {
2263 // These constructs are all bad news when they appear at the start
2264 // of a match. Any character can begin the match.
2265 fRXPat
->fInitialChars
->clear();
2266 fRXPat
->fInitialChars
->complement();
2267 numInitialStrings
+= 2;
2275 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2278 int32_t jmpDest
= URX_VAL(op
);
2279 if (jmpDest
< loc
) {
2280 // Loop of some kind. Can safely ignore, the worst that will happen
2281 // is that we understate the true minimum length
2282 currentLen
= forwardedLength
.elementAti(loc
+1);
2285 // Forward jump. Propagate the current min length to the target loc of the jump.
2286 U_ASSERT(jmpDest
<= end
+1);
2287 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2288 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2297 // Combo of state save to the next loc, + jmp backwards.
2298 // Net effect on min. length computation is nothing.
2303 // Fails are kind of like a branch, except that the min length was
2304 // propagated already, by the state save.
2305 currentLen
= forwardedLength
.elementAti(loc
+1);
2310 case URX_STATE_SAVE
:
2312 // State Save, for forward jumps, propagate the current minimum.
2313 // of the state save.
2314 int32_t jmpDest
= URX_VAL(op
);
2315 if (jmpDest
> loc
) {
2316 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2317 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2330 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2331 int32_t stringLen
= URX_VAL(stringLenOp
);
2332 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2333 U_ASSERT(stringLenOp
>= 2);
2334 if (currentLen
== 0) {
2335 // Add the starting character of this string to the set of possible starting
2336 // characters for this pattern.
2337 int32_t stringStartIdx
= URX_VAL(op
);
2338 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2339 fRXPat
->fInitialChars
->add(c
);
2341 // Remember this string. After the entire pattern has been checked,
2342 // if nothing else is identified that can start a match, we'll use it.
2343 numInitialStrings
++;
2344 fRXPat
->fInitialStringIdx
= stringStartIdx
;
2345 fRXPat
->fInitialStringLen
= stringLen
;
2348 currentLen
+= stringLen
;
2355 // Case-insensitive string. Unlike exact-match strings, we won't
2356 // attempt a string search for possible match positions. But we
2357 // do update the set of possible starting characters.
2359 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2360 int32_t stringLen
= URX_VAL(stringLenOp
);
2361 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2362 U_ASSERT(stringLenOp
>= 2);
2363 if (currentLen
== 0) {
2364 // Add the starting character of this string to the set of possible starting
2365 // characters for this pattern.
2366 int32_t stringStartIdx
= URX_VAL(op
);
2367 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2369 s
.closeOver(USET_CASE
);
2370 fRXPat
->fInitialChars
->addAll(s
);
2371 numInitialStrings
+= 2; // Matching on an initial string not possible.
2373 currentLen
+= stringLen
;
2379 case URX_CTR_INIT_NG
:
2381 // Loop Init Ops. These don't change the min length, but they are 4 word ops
2382 // so location must be updated accordingly.
2384 // If the min loop count == 0
2385 // move loc forwards to the end of the loop, skipping over the body.
2386 // If the min count is > 0,
2387 // continue normal processing of the body of the loop.
2388 int32_t loopEndLoc
= fRXPat
->fCompiledPat
->elementAti(loc
+1);
2389 loopEndLoc
= URX_VAL(loopEndLoc
);
2390 int32_t minLoopCount
= fRXPat
->fCompiledPat
->elementAti(loc
+2);
2391 if (minLoopCount
== 0) {
2392 // Min Loop Count of 0, treat like a forward branch and
2393 // move the current minimum length up to the target
2394 // (end of loop) location.
2395 U_ASSERT(loopEndLoc
<= end
+1);
2396 if (forwardedLength
.elementAti(loopEndLoc
) > currentLen
) {
2397 forwardedLength
.setElementAt(currentLen
, loopEndLoc
);
2400 loc
+=3; // Skips over operands of CTR_INIT
2407 case URX_CTR_LOOP_NG
:
2409 // The jump is conditional, backwards only.
2414 // More loop ops. These state-save to themselves.
2415 // don't change the minimum match
2423 // Look-around. Scan forward until the matching look-ahead end,
2424 // without processing the look-around block. This is overly pessimistic.
2428 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2429 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2432 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2438 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
2439 // Need this because neg lookahead blocks will FAIL to outside
2441 int32_t jmpDest
= URX_VAL(op
);
2442 if (jmpDest
> loc
) {
2443 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2444 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2448 U_ASSERT(loc
<= end
);
2458 U_ASSERT(FALSE
); // Shouldn't get here. These ops should be
2459 // consumed by the scan in URX_LA_START and LB_START
2470 // We have finished walking through the ops. Check whether some forward jump
2471 // propagated a shorter length to location end+1.
2472 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
2473 currentLen
= forwardedLength
.elementAti(end
+1);
2477 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
2480 // Sort out what we should check for when looking for candidate match start positions.
2481 // In order of preference,
2482 // 1. Start of input text buffer.
2483 // 2. A literal string.
2484 // 3. Start of line in multi-line mode.
2485 // 4. A single literal character.
2486 // 5. A character from a set of characters.
2488 if (fRXPat
->fStartType
== START_START
) {
2489 // Match only at the start of an input text string.
2490 // start type is already set. We're done.
2491 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
2492 // Match beginning only with a literal string.
2493 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
2494 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
2495 fRXPat
->fStartType
= START_STRING
;
2496 fRXPat
->fInitialChar
= c
;
2497 } else if (fRXPat
->fStartType
== START_LINE
) {
2498 // Match at start of line in Multi-Line mode.
2499 // Nothing to do here; everything is already set.
2500 } else if (fRXPat
->fMinMatchLen
== 0) {
2501 // Zero length match possible. We could start anywhere.
2502 fRXPat
->fStartType
= START_NO_INFO
;
2503 } else if (fRXPat
->fInitialChars
->size() == 1) {
2504 // All matches begin with the same char.
2505 fRXPat
->fStartType
= START_CHAR
;
2506 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
2507 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
2508 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
2509 fRXPat
->fMinMatchLen
> 0) {
2510 // Matches start with a set of character smaller than the set of all chars.
2511 fRXPat
->fStartType
= START_SET
;
2513 // Matches can start with anything
2514 fRXPat
->fStartType
= START_NO_INFO
;
2522 //----------------------------------------------------------------------------------------
2524 // minMatchLength Calculate the length of the shortest string that could
2525 // match the specified pattern.
2526 // Length is in 16 bit code units, not code points.
2528 // The calculated length may not be exact. The returned
2529 // value may be shorter than the actual minimum; it must
2532 // start and end are the range of p-code operations to be
2533 // examined. The endpoints are included in the range.
2535 //----------------------------------------------------------------------------------------
2536 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
2537 if (U_FAILURE(*fStatus
)) {
2541 U_ASSERT(start
<= end
);
2542 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
2548 int32_t currentLen
= 0;
2551 // forwardedLength is a vector holding minimum-match-length values that
2552 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2553 // It must be one longer than the pattern being checked because some ops
2554 // will jmp to a end-of-block+1 location from within a block, and we must
2555 // count those when checking the block.
2556 UVector32
forwardedLength(end
+2, *fStatus
);
2557 forwardedLength
.setSize(end
+2);
2558 for (loc
=start
; loc
<=end
+1; loc
++) {
2559 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2562 for (loc
= start
; loc
<=end
; loc
++) {
2563 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2564 opType
= URX_TYPE(op
);
2566 // The loop is advancing linearly through the pattern.
2567 // If the op we are now at was the destination of a branch in the pattern,
2568 // and that path has a shorter minimum length than the current accumulated value,
2569 // replace the current accumulated value.
2570 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2571 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2572 currentLen
= forwardedLength
.elementAti(loc
);
2573 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2577 // Ops that don't change the total length matched
2578 case URX_RESERVED_OP
:
2580 case URX_STRING_LEN
:
2582 case URX_START_CAPTURE
:
2583 case URX_END_CAPTURE
:
2584 case URX_BACKSLASH_B
:
2585 case URX_BACKSLASH_BU
:
2586 case URX_BACKSLASH_G
:
2587 case URX_BACKSLASH_Z
:
2590 case URX_RELOC_OPRND
:
2591 case URX_STO_INP_LOC
:
2595 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2598 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2606 // Ops that match a minimum of one character (one or two 16 bit code units.)
2609 case URX_STATIC_SETREF
:
2610 case URX_STAT_SETREF_N
:
2612 case URX_BACKSLASH_D
:
2614 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2615 case URX_DOTANY_ALL
: // . matches one or two.
2618 case URX_DOTANY_ALL_PL
:
2624 loc
++; // URX_JMPX has an extra operand, ignored here,
2625 // otherwise processed identically to URX_JMP.
2628 int32_t jmpDest
= URX_VAL(op
);
2629 if (jmpDest
< loc
) {
2630 // Loop of some kind. Can safely ignore, the worst that will happen
2631 // is that we understate the true minimum length
2632 currentLen
= forwardedLength
.elementAti(loc
+1);
2634 // Forward jump. Propagate the current min length to the target loc of the jump.
2635 U_ASSERT(jmpDest
<= end
+1);
2636 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2637 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2645 // Fails are kind of like a branch, except that the min length was
2646 // propagated already, by the state save.
2647 currentLen
= forwardedLength
.elementAti(loc
+1);
2648 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2653 case URX_STATE_SAVE
:
2655 // State Save, for forward jumps, propagate the current minimum.
2656 // of the state save.
2657 int32_t jmpDest
= URX_VAL(op
);
2658 if (jmpDest
> loc
) {
2659 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2660 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2671 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2672 currentLen
+= URX_VAL(stringLenOp
);
2678 case URX_CTR_INIT_NG
:
2681 // If the min loop count == 0
2682 // move loc forwards to the end of the loop, skipping over the body.
2683 // If the min count is > 0,
2684 // continue normal processing of the body of the loop.
2685 int32_t loopEndLoc
= fRXPat
->fCompiledPat
->elementAti(loc
+1);
2686 loopEndLoc
= URX_VAL(loopEndLoc
);
2687 int32_t minLoopCount
= fRXPat
->fCompiledPat
->elementAti(loc
+2);
2688 if (minLoopCount
== 0) {
2691 loc
+=3; // Skips over operands of CTR_INIT
2698 case URX_CTR_LOOP_NG
:
2700 // The jump is conditional, backwards only.
2704 case URX_LOOP_DOT_I
:
2706 // More loop ops. These state-save to themselves.
2707 // don't change the minimum match - could match nothing at all.
2714 // Look-around. Scan forward until the matching look-ahead end,
2715 // without processing the look-around block. This is overly pessimistic.
2716 // TODO: Positive lookahead could recursively do the block, then continue
2717 // with the longer of the block or the value coming in.
2721 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2722 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2725 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2731 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
2732 // Need this because neg lookahead blocks will FAIL to outside
2734 int32_t jmpDest
= URX_VAL(op
);
2735 if (jmpDest
> loc
) {
2736 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2737 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2742 U_ASSERT(loc
<= end
);
2752 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
2753 // range being sized, which happens when measuring size of look-behind blocks.
2762 // We have finished walking through the ops. Check whether some forward jump
2763 // propagated a shorter length to location end+1.
2764 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
2765 currentLen
= forwardedLength
.elementAti(end
+1);
2766 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2774 //----------------------------------------------------------------------------------------
2776 // maxMatchLength Calculate the length of the longest string that could
2777 // match the specified pattern.
2778 // Length is in 16 bit code units, not code points.
2780 // The calculated length may not be exact. The returned
2781 // value may be longer than the actual maximum; it must
2782 // never be shorter.
2784 //----------------------------------------------------------------------------------------
2785 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
2786 if (U_FAILURE(*fStatus
)) {
2789 U_ASSERT(start
<= end
);
2790 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
2796 int32_t currentLen
= 0;
2797 UVector32
forwardedLength(end
+1, *fStatus
);
2798 forwardedLength
.setSize(end
+1);
2800 for (loc
=start
; loc
<=end
; loc
++) {
2801 forwardedLength
.setElementAt(0, loc
);
2804 for (loc
= start
; loc
<=end
; loc
++) {
2805 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2806 opType
= URX_TYPE(op
);
2808 // The loop is advancing linearly through the pattern.
2809 // If the op we are now at was the destination of a branch in the pattern,
2810 // and that path has a longer maximum length than the current accumulated value,
2811 // replace the current accumulated value.
2812 if (forwardedLength
.elementAti(loc
) > currentLen
) {
2813 currentLen
= forwardedLength
.elementAti(loc
);
2817 // Ops that don't change the total length matched
2818 case URX_RESERVED_OP
:
2820 case URX_STRING_LEN
:
2822 case URX_START_CAPTURE
:
2823 case URX_END_CAPTURE
:
2824 case URX_BACKSLASH_B
:
2825 case URX_BACKSLASH_BU
:
2826 case URX_BACKSLASH_G
:
2827 case URX_BACKSLASH_Z
:
2830 case URX_RELOC_OPRND
:
2831 case URX_STO_INP_LOC
:
2836 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2846 // Ops that increase that cause an unbounded increase in the length
2847 // of a matched string, or that increase it a hard to characterize way.
2848 // Call the max length unbounded, and stop further checking.
2849 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2851 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2853 case URX_DOTANY_ALL_PL
:
2854 currentLen
= INT32_MAX
;
2858 // Ops that match a max of one character (possibly two 16 bit code units.)
2860 case URX_STATIC_SETREF
:
2861 case URX_STAT_SETREF_N
:
2863 case URX_BACKSLASH_D
:
2865 case URX_DOTANY_ALL
:
2870 // Single literal character. Increase current max length by one or two,
2871 // depending on whether the char is in the supplementary range.
2874 if (URX_VAL(op
) > 0x10000) {
2886 int32_t jmpDest
= URX_VAL(op
);
2887 if (jmpDest
< loc
) {
2888 // Loop of some kind. Max match length is unbounded.
2889 currentLen
= INT32_MAX
;
2891 // Forward jump. Propagate the current min length to the target loc of the jump.
2892 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
2893 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2901 // Fails are kind of like a branch, except that the max length was
2902 // propagated already, by the state save.
2903 currentLen
= forwardedLength
.elementAti(loc
+1);
2907 case URX_STATE_SAVE
:
2909 // State Save, for forward jumps, propagate the current minimum.
2910 // of the state save.
2911 // For backwards jumps, they create a loop, maximum
2912 // match length is unbounded.
2913 int32_t jmpDest
= URX_VAL(op
);
2914 if (jmpDest
> loc
) {
2915 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
2916 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2919 currentLen
= INT32_MAX
;
2931 int32_t stringLenOp
= fRXPat
->fCompiledPat
->elementAti(loc
);
2932 currentLen
+= URX_VAL(stringLenOp
);
2938 case URX_CTR_INIT_NG
:
2940 case URX_CTR_LOOP_NG
:
2942 case URX_LOOP_DOT_I
:
2944 // For anything to do with loops, make the match length unbounded.
2945 // Note: INIT instructions are multi-word. Can ignore because
2946 // INT32_MAX length will stop the per-instruction loop.
2947 currentLen
= INT32_MAX
;
2954 // Look-ahead. Just ignore, treat the look-ahead block as if
2955 // it were normal pattern. Gives a too-long match length,
2956 // but good enough for now.
2959 // End of look-ahead ops should always be consumed by the processing at
2960 // the URX_LA_START op.
2966 // Look-behind. Scan forward until the matching look-around end,
2967 // without processing the look-behind block.
2971 op
= fRXPat
->fCompiledPat
->elementAti(loc
);
2972 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
2975 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
2981 U_ASSERT(loc
< end
);
2991 if (currentLen
== INT32_MAX
) {
2992 // The maximum length is unbounded.
2993 // Stop further processing of the pattern.
3003 //----------------------------------------------------------------------------------------
3005 // stripNOPs Remove any NOP operations from the compiled pattern code.
3006 // Extra NOPs are inserted for some constructs during the initial
3007 // code generation to provide locations that may be patched later.
3008 // Many end up unneeded, and are removed by this function.
3010 //----------------------------------------------------------------------------------------
3011 void RegexCompile::stripNOPs() {
3013 if (U_FAILURE(*fStatus
)) {
3017 int32_t end
= fRXPat
->fCompiledPat
->size();
3018 UVector32
deltas(end
, *fStatus
);
3020 // Make a first pass over the code, computing the amount that things
3021 // will be offset at each location in the original code.
3024 for (loc
=0; loc
<end
; loc
++) {
3025 deltas
.addElement(d
, *fStatus
);
3026 int32_t op
= fRXPat
->fCompiledPat
->elementAti(loc
);
3027 if (URX_TYPE(op
) == URX_NOP
) {
3032 // Make a second pass over the code, removing the NOPs by moving following
3033 // code up, and patching operands that refer to code locations that
3034 // are being moved. The array of offsets from the first step is used
3035 // to compute the new operand values.
3038 for (src
=0; src
<end
; src
++) {
3039 int32_t op
= fRXPat
->fCompiledPat
->elementAti(src
);
3040 int32_t opType
= URX_TYPE(op
);
3045 case URX_STATE_SAVE
:
3048 case URX_CTR_LOOP_NG
:
3049 case URX_RELOC_OPRND
:
3053 // These are instructions with operands that refer to code locations.
3055 int32_t operandAddress
= URX_VAL(op
);
3056 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3057 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3058 op
= URX_BUILD(opType
, fixedOperandAddress
);
3059 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3064 case URX_RESERVED_OP
:
3065 case URX_RESERVED_OP_N
:
3070 case URX_STRING_LEN
:
3071 case URX_START_CAPTURE
:
3072 case URX_END_CAPTURE
:
3073 case URX_STATIC_SETREF
:
3074 case URX_STAT_SETREF_N
:
3078 case URX_BACKSLASH_B
:
3079 case URX_BACKSLASH_BU
:
3080 case URX_BACKSLASH_G
:
3081 case URX_BACKSLASH_X
:
3082 case URX_BACKSLASH_Z
:
3083 case URX_DOTANY_ALL
:
3084 case URX_DOTANY_ALL_PL
:
3086 case URX_BACKSLASH_D
:
3090 case URX_CTR_INIT_NG
:
3094 case URX_STO_INP_LOC
:
3108 case URX_LOOP_DOT_I
:
3110 // These instructions are unaltered by the relocation.
3111 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3116 // Some op is unaccounted for.
3118 error(U_REGEX_INTERNAL_ERROR
);
3122 fRXPat
->fCompiledPat
->setSize(dst
);
3128 //----------------------------------------------------------------------------------------
3130 // OptDotStar Optimize patterns that end with a '.*' or '.+' to
3131 // just advance the input to the end.
3133 // Transform this compiled sequence
3134 // [DOT_ANY | DOT_ANY_ALL]
3135 // JMP_SAV to previous instruction
3136 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3141 // [DOT_ANY_PL | DOT_ANY_ALL_PL]
3142 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3145 //----------------------------------------------------------------------------------------
3146 void RegexCompile::OptDotStar() {
3147 // Scan backwards in the pattern, looking for a JMP_SAV near the end.
3151 for (jmpLoc
=fRXPat
->fCompiledPat
->size(); jmpLoc
--;) {
3153 op
= fRXPat
->fCompiledPat
->elementAti(jmpLoc
);
3154 opType
= URX_TYPE(op
);
3160 case URX_END_CAPTURE
:
3163 case URX_BACKSLASH_Z
:
3164 // These ops may follow the JMP_SAV without preventing us from
3165 // doing this optimization.
3169 // Got a trailing JMP_SAV that's a candidate for optimization.
3173 // This optimization not possible.
3176 break; // from the for loop.
3179 // We found in URX_JMP_SAV near the end that is a candidate for optimizing.
3180 // Is the target address the previous instruction?
3181 // Is the previous instruction a flavor of URX_DOTANY
3182 int32_t loopTopLoc
= URX_VAL(op
);
3183 if (loopTopLoc
!= jmpLoc
-1) {
3187 int32_t oldOp
= fRXPat
->fCompiledPat
->elementAti(loopTopLoc
);
3188 int32_t oldOpType
= opType
= URX_TYPE(oldOp
);
3189 if (oldOpType
== URX_DOTANY
) {
3190 newOp
= URX_BUILD(URX_DOTANY_PL
, 0);
3192 else if (oldOpType
== URX_DOTANY_ALL
) {
3193 newOp
= URX_BUILD(URX_DOTANY_ALL_PL
, 0);
3195 return; // Sequence we were looking for isn't there.
3198 // Substitute the new instructions into the pattern.
3199 // The NOP will be removed in a later optimization step.
3200 fRXPat
->fCompiledPat
->setElementAt(URX_BUILD(URX_NOP
, 0), loopTopLoc
);
3201 fRXPat
->fCompiledPat
->setElementAt(newOp
, jmpLoc
);
3205 //----------------------------------------------------------------------------------------
3207 // Error Report a rule parse error.
3208 // Only report it if no previous error has been recorded.
3210 //----------------------------------------------------------------------------------------
3211 void RegexCompile::error(UErrorCode e
) {
3212 if (U_SUCCESS(*fStatus
)) {
3214 fParseErr
->line
= fLineNum
;
3215 fParseErr
->offset
= fCharNum
;
3217 // Fill in the context.
3218 // Note: extractBetween() pins supplied indicies to the string bounds.
3219 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3220 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3221 fRXPat
->fPattern
.extractBetween(fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
,
3222 fParseErr
->preContext
, 0);
3223 fRXPat
->fPattern
.extractBetween(fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1,
3224 fParseErr
->postContext
, 0);
3230 // Assorted Unicode character constants.
3231 // Numeric because there is no portable way to enter them as literals.
3234 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3235 static const UChar chLF
= 0x0a;
3236 static const UChar chNEL
= 0x85; // NEL newline variant
3237 static const UChar chLS
= 0x2028; // Unicode Line Separator
3238 static const UChar chApos
= 0x27; // single quote, for quoted chars.
3239 static const UChar chPound
= 0x23; // '#', introduces a comment.
3240 static const UChar chE
= 0x45; // 'E'
3241 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3242 static const UChar chLParen
= 0x28;
3243 static const UChar chRParen
= 0x29;
3244 static const UChar chLBracket
= 0x5b;
3245 static const UChar chRBracket
= 0x5d;
3246 static const UChar chRBrace
= 0x7d;
3247 static const UChar chUpperN
= 0x4E;
3248 static const UChar chLowerP
= 0x70;
3249 static const UChar chUpperP
= 0x50;
3252 //----------------------------------------------------------------------------------------
3254 // nextCharLL Low Level Next Char from the regex pattern.
3255 // Get a char from the string, keep track of input position
3256 // for error reporting.
3258 //----------------------------------------------------------------------------------------
3259 UChar32
RegexCompile::nextCharLL() {
3261 UnicodeString
&pattern
= fRXPat
->fPattern
;
3263 if (fPeekChar
!= -1) {
3268 if (fPatternLength
==0 || fNextIndex
>= fPatternLength
) {
3271 ch
= pattern
.char32At(fNextIndex
);
3272 fNextIndex
= pattern
.moveIndex32(fNextIndex
, 1);
3277 ch
== chLF
&& fLastChar
!= chCR
) {
3278 // Character is starting a new line. Bump up the line number, and
3279 // reset the column to 0.
3283 error(U_REGEX_RULE_SYNTAX
);
3288 // Character is not starting a new line. Except in the case of a
3289 // LF following a CR, increment the column position.
3298 //---------------------------------------------------------------------------------
3300 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3301 // character without actually getting it.
3303 //---------------------------------------------------------------------------------
3304 UChar32
RegexCompile::peekCharLL() {
3305 if (fPeekChar
== -1) {
3306 fPeekChar
= nextCharLL();
3312 //---------------------------------------------------------------------------------
3314 // nextChar for pattern scanning. At this level, we handle stripping
3315 // out comments and processing some backslash character escapes.
3316 // The rest of the pattern grammar is handled at the next level up.
3318 //---------------------------------------------------------------------------------
3319 void RegexCompile::nextChar(RegexPatternChar
&c
) {
3321 fScanIndex
= fNextIndex
;
3322 c
.fChar
= nextCharLL();
3327 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
) || c
.fChar
== (UChar32
)-1) {
3328 fQuoteMode
= FALSE
; // Exit quote mode,
3329 nextCharLL(); // discard the E
3330 nextChar(c
); // recurse to get the real next char
3333 else if (fInBackslashQuote
) {
3334 // The current character immediately follows a '\'
3335 // Don't check for any further escapes, just return it as-is.
3336 // Don't set c.fQuoted, because that would prevent the state machine from
3337 // dispatching on the character.
3338 fInBackslashQuote
= FALSE
;
3342 // We are not in a \Q quoted region \E of the source.
3344 if (fModeFlags
& UREGEX_COMMENTS
) {
3346 // We are in free-spacing and comments mode.
3347 // Scan through any white space and comments, until we
3348 // reach a significant character or the end of inut.
3350 if (c
.fChar
== (UChar32
)-1) {
3351 break; // End of Input
3353 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
3354 // Start of a comment. Consume the rest of it, until EOF or a new line
3356 c
.fChar
= nextCharLL();
3357 if (c
.fChar
== (UChar32
)-1 || // EOF
3366 if (uprv_isRuleWhiteSpace(c
.fChar
) == FALSE
) {
3369 c
.fChar
= nextCharLL();
3374 // check for backslash escaped characters.
3376 int32_t startX
= fNextIndex
; // start and end positions of the
3377 int32_t endX
= fNextIndex
; // sequence following the '\'
3378 if (c
.fChar
== chBackSlash
) {
3379 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
->contains(peekCharLL())) {
3381 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3382 // Includes \uxxxx, \n, \r, many others.
3383 // Return the single equivalent character.
3385 nextCharLL(); // get & discard the peeked char.
3387 c
.fChar
= fRXPat
->fPattern
.unescapeAt(endX
);
3388 if (startX
== endX
) {
3389 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
3391 fCharNum
+= endX
- startX
;
3396 // We are in a '\' escape that will be handled by the state table scanner.
3397 // Just return the backslash, but remember that the following char is to
3398 // be taken literally. TODO: this is awkward, think about alternatives.
3399 fInBackslashQuote
= TRUE
;
3404 // re-enable # to end-of-line comments, in case they were disabled.
3405 // They are disabled by the parser upon seeing '(?', but this lasts for
3406 // the fetching of the next character only.
3407 fEOLComments
= TRUE
;
3409 // putc(c.fChar, stdout);
3414 //---------------------------------------------------------------------------------
3416 // scanSet Construct a UnicodeSet from the text at the current scan
3417 // position. Advance the scan position to the first character
3420 // The scan position is normally under the control of the state machine
3421 // that controls pattern parsing. UnicodeSets, however, are parsed by
3422 // the UnicodeSet constructor, not by the Regex pattern parser.
3424 //---------------------------------------------------------------------------------
3425 UnicodeSet
*RegexCompile::scanSet() {
3426 UnicodeSet
*uset
= NULL
;
3431 if (U_FAILURE(*fStatus
)) {
3435 pos
.setIndex(fScanIndex
);
3436 startPos
= fScanIndex
;
3437 UErrorCode localStatus
= U_ZERO_ERROR
;
3438 uint32_t usetFlags
= 0;
3439 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
3440 usetFlags
|= USET_CASE_INSENSITIVE
;
3442 if (fModeFlags
& UREGEX_COMMENTS
) {
3443 usetFlags
|= USET_IGNORE_SPACE
;
3446 uset
= new UnicodeSet(fRXPat
->fPattern
, pos
,
3447 usetFlags
, NULL
, localStatus
);
3448 if (U_FAILURE(localStatus
)) {
3449 // TODO: Get more accurate position of the error from UnicodeSet's return info.
3450 // UnicodeSet appears to not be reporting correctly at this time.
3451 REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos
.getIndex()));
3457 // Advance the current scan postion over the UnicodeSet.
3458 // Don't just set fScanIndex because the line/char positions maintained
3459 // for error reporting would be thrown off.
3462 if (fNextIndex
>= i
) {
3472 //---------------------------------------------------------------------------------
3474 // scanProp Construct a UnicodeSet from the text at the current scan
3475 // position, which will be of the form \p{whaterver}
3477 // The scan position will be at the 'p' or 'P'. On return
3478 // the scan position should be just after the '}'
3480 // Return a UnicodeSet, constructed from the \P pattern,
3481 // or NULL if the pattern is invalid.
3483 //---------------------------------------------------------------------------------
3484 UnicodeSet
*RegexCompile::scanProp() {
3485 UnicodeSet
*uset
= NULL
;
3487 if (U_FAILURE(*fStatus
)) {
3491 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chUpperP
|| fC
.fChar
== chUpperN
);
3493 // enclose the \p{property} from the regex pattern source in [brackets]
3494 UnicodeString setPattern
;
3495 setPattern
.append(chLBracket
);
3496 setPattern
.append(chBackSlash
);
3498 setPattern
.append(fC
.fChar
);
3499 if (fC
.fChar
== chRBrace
) {
3503 if (fC
.fChar
== -1) {
3504 // Hit the end of the input string without finding the closing '}'
3505 error(U_REGEX_PROPERTY_SYNTAX
);
3509 setPattern
.append(chRBracket
);
3511 uint32_t usetFlags
= 0;
3512 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
3513 usetFlags
|= USET_CASE_INSENSITIVE
;
3515 if (fModeFlags
& UREGEX_COMMENTS
) {
3516 usetFlags
|= USET_IGNORE_SPACE
;
3519 // Build the UnicodeSet from the set pattern we just built up in a string.
3520 uset
= new UnicodeSet(setPattern
, usetFlags
, NULL
, *fStatus
);
3521 if (U_FAILURE(*fStatus
)) {
3526 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
3531 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS