1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 // All Rights Reserved.
9 // This file contains the ICU regular expression compiler, which is responsible
10 // for processing a regular expression pattern into the compiled form that
11 // is used by the match finding engine.
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
39 #include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
50 //------------------------------------------------------------------------------
54 //------------------------------------------------------------------------------
55 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) :
56 fParenStack(status
), fSetStack(status
), fSetOpStack(status
)
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status
);
70 fInBackslashQuote
= FALSE
;
71 fModeFlags
= fRXPat
->fFlags
| 0x80000000;
75 fMatchCloseParen
= -1;
77 fLastSetLiteral
= U_SENTINEL
;
79 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
80 status
= rxp
->fDeferredStatus
;
84 static const UChar chAmp
= 0x26; // '&'
85 static const UChar chDash
= 0x2d; // '-'
88 //------------------------------------------------------------------------------
92 //------------------------------------------------------------------------------
93 RegexCompile::~RegexCompile() {
94 delete fCaptureName
; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
98 static inline void addCategory(UnicodeSet
*set
, int32_t value
, UErrorCode
& ec
) {
99 set
->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, value
, ec
));
102 //------------------------------------------------------------------------------
104 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
105 // The state tables are hand-written in the file regexcst.txt,
106 // and converted to the form used here by a perl
107 // script regexcst.pl
109 //------------------------------------------------------------------------------
110 void RegexCompile::compile(
111 const UnicodeString
&pat
, // Source pat to be compiled.
112 UParseError
&pp
, // Error position info
113 UErrorCode
&e
) // Error Code
115 fRXPat
->fPatternString
= new UnicodeString(pat
);
116 UText patternText
= UTEXT_INITIALIZER
;
117 utext_openConstUnicodeString(&patternText
, fRXPat
->fPatternString
, &e
);
120 compile(&patternText
, pp
, e
);
121 utext_close(&patternText
);
126 // compile, UText mode
127 // All the work is actually done here.
129 void RegexCompile::compile(
130 UText
*pat
, // Source pat to be compiled.
131 UParseError
&pp
, // Error position info
132 UErrorCode
&e
) // Error Code
137 fStack
[fStackPtr
] = 0;
139 if (U_FAILURE(*fStatus
)) {
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat
->fPattern
== NULL
|| utext_nativeLength(fRXPat
->fPattern
) == 0);
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat
->fPattern
= utext_clone(fRXPat
->fPattern
, pat
, FALSE
, TRUE
, fStatus
);
148 if (U_FAILURE(*fStatus
)) {
151 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
152 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
155 // Initialize the pattern scanning state machine
156 fPatternLength
= utext_nativeLength(pat
);
158 const RegexTableEl
*tableEl
;
160 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161 if (fModeFlags
& UREGEX_LITERAL
) {
165 nextChar(fC
); // Fetch the first char from the pattern string.
168 // Main loop for the regex pattern parsing state machine.
169 // Runs once per state transition.
170 // Each time through optionally performs, depending on the state table,
171 // - an advance to the the next pattern char
172 // - an action to be performed.
173 // - pushing or popping a state to/from the local state return stack.
174 // file regexcst.txt is the source for the state table. The logic behind
175 // recongizing the pattern syntax is there, not here.
178 // Bail out if anything has gone wrong.
179 // Regex pattern parsing stops on the first error encountered.
180 if (U_FAILURE(*fStatus
)) {
184 U_ASSERT(state
!= 0);
186 // Find the state table element that matches the input char from the pattern, or the
187 // class of the input character. Start with the first table row for this
188 // state, then linearly scan forward until we find a row that matches the
189 // character. The last row for each state always matches all characters, so
190 // the search will stop there, if not before.
192 tableEl
= &gRuleParseStateTable
[state
];
193 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
194 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]));
196 for (;;) { // loop through table rows belonging to this state, looking for one
197 // that matches the current input char.
198 REGEX_SCAN_DEBUG_PRINTF(("."));
199 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
200 // Table row specified an individual character, not a set, and
201 // the input character is not quoted, and
202 // the input character matched it.
205 if (tableEl
->fCharClass
== 255) {
206 // Table row specified default, match anything character class.
209 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
210 // Table row specified "quoted" and the char was quoted.
213 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
214 // Table row specified eof and we hit eof on the input.
218 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
219 fC
.fQuoted
== FALSE
&& // char is not escaped &&
220 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
221 U_ASSERT(tableEl
->fCharClass
<= 137);
222 if (RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128].contains(fC
.fChar
)) {
223 // Table row specified a character class, or set of characters,
224 // and the current char matches it.
229 // No match on this row, advance to the next row for this state,
232 REGEX_SCAN_DEBUG_PRINTF(("\n"));
235 // We've found the row of the state table that matches the current input
236 // character from the rules string.
237 // Perform any action specified by this row in the state table.
238 if (doParseActions(tableEl
->fAction
) == FALSE
) {
239 // Break out of the state machine loop if the
240 // the action signalled some kind of error, or
241 // the action was to exit, occurs on normal end-of-rules-input.
245 if (tableEl
->fPushState
!= 0) {
247 if (fStackPtr
>= kStackSize
) {
248 error(U_REGEX_INTERNAL_ERROR
);
249 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
252 fStack
[fStackPtr
] = tableEl
->fPushState
;
256 // NextChar. This is where characters are actually fetched from the pattern.
257 // Happens under control of the 'n' tag in the state table.
259 if (tableEl
->fNextChar
) {
263 // Get the next state from the table entry, or from the
264 // state stack if the next state was specified as "pop".
265 if (tableEl
->fNextState
!= 255) {
266 state
= tableEl
->fNextState
;
268 state
= fStack
[fStackPtr
];
271 // state stack underflow
272 // This will occur if the user pattern has mis-matched parentheses,
273 // with extra close parens.
276 error(U_REGEX_MISMATCHED_PAREN
);
282 if (U_FAILURE(*fStatus
)) {
283 // Bail out if the pattern had errors.
284 // Set stack cleanup: a successful compile would have left it empty,
285 // but errors can leave temporary sets hanging around.
286 while (!fSetStack
.empty()) {
287 delete (UnicodeSet
*)fSetStack
.pop();
293 // The pattern has now been read and processed, and the compiled code generated.
297 // The pattern's fFrameSize so far has accumulated the requirements for
298 // storage for capture parentheses, counters, etc. that are encountered
299 // in the pattern. Add space for the two variables that are always
300 // present in the saved state: the input string position (int64_t) and
301 // the position in the compiled pattern.
303 allocateStackData(RESTACKFRAME_HDRCOUNT
);
306 // Optimization pass 1: NOPs, back-references, and case-folding
311 // Get bounds for the minimum and maximum length of a string that this
312 // pattern can match. Used to avoid looking for matches in strings that
315 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
318 // Optimization pass 2: match start type
323 // Set up fast latin-1 range sets
325 int32_t numSets
= fRXPat
->fSets
->size();
326 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
327 // Null pointer check.
328 if (fRXPat
->fSets8
== NULL
) {
329 e
= *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
333 for (i
=0; i
<numSets
; i
++) {
334 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
335 fRXPat
->fSets8
[i
].init(s
);
344 //------------------------------------------------------------------------------
346 // doParseAction Do some action during regex pattern parsing.
347 // Called by the parse state machine.
349 // Generation of the match engine PCode happens here, or
350 // in functions called from the parse actions defined here.
353 //------------------------------------------------------------------------------
354 UBool
RegexCompile::doParseActions(int32_t action
)
356 UBool returnVal
= TRUE
;
358 switch ((Regex_PatternParseAction
)action
) {
361 // Start of pattern compiles to:
362 //0 SAVE 2 Fall back to position of FAIL
364 //2 FAIL Stop if we ever reach here.
365 //3 NOP Dummy, so start of pattern looks the same as
366 // the start of an ( grouping.
367 //4 NOP Resreved, will be replaced by a save if there are
368 // OR | operators at the top level
369 appendOp(URX_STATE_SAVE
, 2);
370 appendOp(URX_JMP
, 3);
371 appendOp(URX_FAIL
, 0);
373 // Standard open nonCapture paren action emits the two NOPs and
374 // sets up the paren stack frame.
375 doParseActions(doOpenNonCaptureParen
);
379 // We've scanned to the end of the pattern
380 // The end of pattern compiles to:
382 // which will stop the runtime match engine.
383 // Encountering end of pattern also behaves like a close paren,
384 // and forces fixups of the State Save at the beginning of the compiled pattern
385 // and of any OR operations at the top level.
388 if (fParenStack
.size() > 0) {
389 // Missing close paren in pattern.
390 error(U_REGEX_MISMATCHED_PAREN
);
393 // add the END operation to the compiled pattern.
394 appendOp(URX_END
, 0);
396 // Terminate the pattern compilation state machine.
403 // Scanning a '|', as in (A|B)
405 // Generate code for any pending literals preceding the '|'
408 // Insert a SAVE operation at the start of the pattern section preceding
409 // this OR at this level. This SAVE will branch the match forward
410 // to the right hand side of the OR in the event that the left hand
411 // side fails to match and backtracks. Locate the position for the
412 // save from the location on the top of the parentheses stack.
413 int32_t savePosition
= fParenStack
.popi();
414 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(savePosition
);
415 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
416 op
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
417 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
419 // Append an JMP operation into the compiled pattern. The operand for
420 // the JMP will eventually be the location following the ')' for the
421 // group. This will be patched in later, when the ')' is encountered.
422 appendOp(URX_JMP
, 0);
424 // Push the position of the newly added JMP op onto the parentheses stack.
425 // This registers if for fixup when this block's close paren is encountered.
426 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
428 // Append a NOP to the compiled pattern. This is the slot reserved
429 // for a SAVE in the event that there is yet another '|' following
431 appendOp(URX_NOP
, 0);
432 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
437 case doBeginNamedCapture
:
438 // Scanning (?<letter.
439 // The first letter of the name will come through again under doConinueNamedCapture.
440 fCaptureName
= new UnicodeString();
441 if (fCaptureName
== NULL
) {
442 error(U_MEMORY_ALLOCATION_ERROR
);
446 case doContinueNamedCapture
:
447 fCaptureName
->append(fC
.fChar
);
450 case doBadNamedCapture
:
451 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
454 case doOpenCaptureParen
:
455 // Open Capturing Paren, possibly named.
457 // - NOP, which later may be replaced by a save-state if the
458 // parenthesized group gets a * quantifier, followed by
459 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
460 // - NOP, which may later be replaced by a save-state if there
461 // is an '|' alternation within the parens.
463 // Each capture group gets three slots in the save stack frame:
464 // 0: Capture Group start position (in input string being matched.)
465 // 1: Capture Group end position.
466 // 2: Start of Match-in-progress.
467 // The first two locations are for a completed capture group, and are
468 // referred to by back references and the like.
469 // The third location stores the capture start position when an START_CAPTURE is
470 // encountered. This will be promoted to a completed capture when (and if) the corresponding
471 // END_CAPTURE is encountered.
474 appendOp(URX_NOP
, 0);
475 int32_t varsLoc
= allocateStackData(3); // Reserve three slots in match stack frame.
476 appendOp(URX_START_CAPTURE
, varsLoc
);
477 appendOp(URX_NOP
, 0);
479 // On the Parentheses stack, start a new frame and add the postions
480 // of the two NOPs. Depending on what follows in the pattern, the
481 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
482 // address of the end of the parenthesized group.
483 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
484 fParenStack
.push(capturing
, *fStatus
); // Frame type.
485 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
486 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
488 // Save the mapping from group number to stack frame variable position.
489 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
491 // If this is a named capture group, add the name->group number mapping.
492 if (fCaptureName
!= NULL
) {
493 int32_t groupNumber
= fRXPat
->fGroupMap
->size();
494 int32_t previousMapping
= uhash_puti(fRXPat
->fNamedCaptureMap
, fCaptureName
, groupNumber
, fStatus
);
495 fCaptureName
= NULL
; // hash table takes ownership of the name (key) string.
496 if (previousMapping
> 0 && U_SUCCESS(*fStatus
)) {
497 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
503 case doOpenNonCaptureParen
:
504 // Open non-caputuring (grouping only) Paren.
506 // - NOP, which later may be replaced by a save-state if the
507 // parenthesized group gets a * quantifier, followed by
508 // - NOP, which may later be replaced by a save-state if there
509 // is an '|' alternation within the parens.
512 appendOp(URX_NOP
, 0);
513 appendOp(URX_NOP
, 0);
515 // On the Parentheses stack, start a new frame and add the postions
517 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
518 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
519 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
520 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
525 case doOpenAtomicParen
:
526 // Open Atomic Paren. (?>
528 // - NOP, which later may be replaced if the parenthesized group
529 // has a quantifier, followed by
530 // - STO_SP save state stack position, so it can be restored at the ")"
531 // - NOP, which may later be replaced by a save-state if there
532 // is an '|' alternation within the parens.
535 appendOp(URX_NOP
, 0);
536 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the state stack ptr.
537 appendOp(URX_STO_SP
, varLoc
);
538 appendOp(URX_NOP
, 0);
540 // On the Parentheses stack, start a new frame and add the postions
541 // of the two NOPs. Depending on what follows in the pattern, the
542 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
543 // address of the end of the parenthesized group.
544 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
545 fParenStack
.push(atomic
, *fStatus
); // Frame type.
546 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
547 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
552 case doOpenLookAhead
:
553 // Positive Look-ahead (?= stuff )
555 // Note: Addition of transparent input regions, with the need to
556 // restore the original regions when failing out of a lookahead
557 // block, complicated this sequence. Some conbined opcodes
558 // might make sense - or might not, lookahead aren't that common.
560 // Caution: min match length optimization knows about this
561 // sequence; don't change without making updates there too.
564 // 1 START_LA dataLoc Saves SP, Input Pos
565 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
566 // 3 JMP 6 continue ...
568 // 4. LA_END Look Ahead failed. Restore regions.
569 // 5. BACKTRACK and back track again.
571 // 6. NOP reserved for use by quantifiers on the block.
572 // Look-ahead can't have quantifiers, but paren stack
573 // compile time conventions require the slot anyhow.
574 // 7. NOP may be replaced if there is are '|' ops in the block.
575 // 8. code for parenthesized stuff.
578 // Two data slots are reserved, for saving the stack ptr and the input position.
581 int32_t dataLoc
= allocateData(2);
582 appendOp(URX_LA_START
, dataLoc
);
583 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+ 2);
584 appendOp(URX_JMP
, fRXPat
->fCompiledPat
->size()+ 3);
585 appendOp(URX_LA_END
, dataLoc
);
586 appendOp(URX_BACKTRACK
, 0);
587 appendOp(URX_NOP
, 0);
588 appendOp(URX_NOP
, 0);
590 // On the Parentheses stack, start a new frame and add the postions
592 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
593 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
594 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
595 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
599 case doOpenLookAheadNeg
:
600 // Negated Lookahead. (?! stuff )
602 // 1. START_LA dataloc
603 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
604 // // which continues with the match.
605 // 3. NOP // Std. Open Paren sequence, for possible '|'
606 // 4. code for parenthesized stuff.
607 // 5. END_LA // Cut back stack, remove saved state from step 2.
608 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
609 // 7. END_LA // Restore match region, in case look-ahead was using
610 // an alternate (transparent) region.
613 int32_t dataLoc
= allocateData(2);
614 appendOp(URX_LA_START
, dataLoc
);
615 appendOp(URX_STATE_SAVE
, 0); // dest address will be patched later.
616 appendOp(URX_NOP
, 0);
618 // On the Parentheses stack, start a new frame and add the postions
619 // of the StateSave and NOP.
620 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
621 fParenStack
.push(negLookAhead
, *fStatus
); // Frame type
622 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
623 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
625 // Instructions #5 - #7 will be added when the ')' is encountered.
629 case doOpenLookBehind
:
631 // Compile a (?<= look-behind open paren.
634 // 0 URX_LB_START dataLoc
635 // 1 URX_LB_CONT dataLoc
638 // 4 URX_NOP Standard '(' boilerplate.
639 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
640 // 6 <code for LookBehind expression>
641 // 7 URX_LB_END dataLoc # Check match len, restore input len
642 // 8 URX_LA_END dataLoc # Restore stack, input pos
644 // Allocate a block of matcher data, to contain (when running a match)
645 // 0: Stack ptr on entry
646 // 1: Input Index on entry
647 // 2: Start index of match current match attempt.
648 // 3: Original Input String len.
650 // Generate match code for any pending literals.
653 // Allocate data space
654 int32_t dataLoc
= allocateData(4);
657 appendOp(URX_LB_START
, dataLoc
);
660 appendOp(URX_LB_CONT
, dataLoc
);
661 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
662 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
665 appendOp(URX_NOP
, 0);
666 appendOp(URX_NOP
, 0);
668 // On the Parentheses stack, start a new frame and add the postions
669 // of the URX_LB_CONT and the NOP.
670 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
671 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
672 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
673 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
675 // The final two instructions will be added when the ')' is encountered.
680 case doOpenLookBehindNeg
:
682 // Compile a (?<! negated look-behind open paren.
685 // 0 URX_LB_START dataLoc # Save entry stack, input len
686 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
690 // 5 URX_NOP Standard '(' boilerplate.
691 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
692 // 7 <code for LookBehind expression>
693 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
696 // Allocate a block of matcher data, to contain (when running a match)
697 // 0: Stack ptr on entry
698 // 1: Input Index on entry
699 // 2: Start index of match current match attempt.
700 // 3: Original Input String len.
702 // Generate match code for any pending literals.
705 // Allocate data space
706 int32_t dataLoc
= allocateData(4);
709 appendOp(URX_LB_START
, dataLoc
);
712 appendOp(URX_LBN_CONT
, dataLoc
);
713 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
714 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
715 appendOp(URX_RESERVED_OP
, 0); // Continue Loc. To be filled later.
718 appendOp(URX_NOP
, 0);
719 appendOp(URX_NOP
, 0);
721 // On the Parentheses stack, start a new frame and add the postions
722 // of the URX_LB_CONT and the NOP.
723 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
724 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
725 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
726 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
728 // The final two instructions will be added when the ')' is encountered.
732 case doConditionalExpr
:
733 // Conditionals such as (?(1)a:b)
735 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
736 error(U_REGEX_UNIMPLEMENTED
);
742 if (fParenStack
.size() <= 0) {
743 // Extra close paren, or missing open paren.
744 error(U_REGEX_MISMATCHED_PAREN
);
752 case doBadOpenParenType
:
754 error(U_REGEX_RULE_SYNTAX
);
758 case doMismatchedParenErr
:
759 error(U_REGEX_MISMATCHED_PAREN
);
763 // Normal '+' compiles to
764 // 1. stuff to be repeated (already built)
768 // Or, if the item to be repeated can match a zero length string,
769 // 1. STO_INP_LOC data-loc
770 // 2. body of stuff to be repeated
775 // Or, if the item to be repeated is simple
776 // 1. Item to be repeated.
777 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
778 // 3. LOOP_C stack location
780 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
783 // Check for simple constructs, which may get special optimized code.
784 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
785 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
787 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
788 // Emit optimized code for [char set]+
789 appendOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
790 frameLoc
= allocateStackData(1);
791 appendOp(URX_LOOP_C
, frameLoc
);
795 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
796 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
797 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
798 // Emit Optimized code for .+ operations.
799 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
800 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
801 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
804 if (fModeFlags
& UREGEX_UNIX_LINES
) {
808 frameLoc
= allocateStackData(1);
809 appendOp(URX_LOOP_C
, frameLoc
);
817 // Check for minimum match length of zero, which requires
818 // extra loop-breaking code.
819 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
820 // Zero length match is possible.
821 // Emit the code sequence that can handle it.
823 frameLoc
= allocateStackData(1);
825 int32_t op
= buildOp(URX_STO_INP_LOC
, frameLoc
);
826 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
828 appendOp(URX_JMP_SAV_X
, topLoc
+1);
830 // Simpler code when the repeated body must match something non-empty
831 appendOp(URX_JMP_SAV
, topLoc
);
837 // Non-greedy '+?' compiles to
838 // 1. stuff to be repeated (already built)
842 int32_t topLoc
= blockTopLoc(FALSE
);
843 appendOp(URX_STATE_SAVE
, topLoc
);
849 // Normal (greedy) ? quantifier.
852 // 2. body of optional block
854 // Insert the state save into the compiled pattern, and we're done.
856 int32_t saveStateLoc
= blockTopLoc(TRUE
);
857 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
858 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
863 // Non-greedy ?? quantifier
866 // 2. body of optional block
870 // This code is less than ideal, with two jmps instead of one, because we can only
871 // insert one instruction at the top of the block being iterated.
873 int32_t jmp1_loc
= blockTopLoc(TRUE
);
874 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
876 int32_t jmp1_op
= buildOp(URX_JMP
, jmp2_loc
+1);
877 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
879 appendOp(URX_JMP
, jmp2_loc
+2);
881 appendOp(URX_STATE_SAVE
, jmp1_loc
+1);
887 // Normal (greedy) * quantifier.
890 // 2. body of stuff being iterated over
894 // Or, if the body is a simple [Set],
895 // 1. LOOP_SR_I set number
896 // 2. LOOP_C stack location
899 // Or if this is a .*
900 // 1. LOOP_DOT_I (. matches all mode flag)
901 // 2. LOOP_C stack location
903 // Or, if the body can match a zero-length string, to inhibit infinite loops,
905 // 2. STO_INP_LOC data-loc
910 // location of item #1, the STATE_SAVE
911 int32_t topLoc
= blockTopLoc(FALSE
);
912 int32_t dataLoc
= -1;
914 // Check for simple *, where the construct being repeated
915 // compiled to single opcode, and might be optimizable.
916 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
917 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
919 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
920 // Emit optimized code for a [char set]*
921 int32_t loopOpI
= buildOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
922 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
923 dataLoc
= allocateStackData(1);
924 appendOp(URX_LOOP_C
, dataLoc
);
928 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
929 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
930 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
931 // Emit Optimized code for .* operations.
932 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
933 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
934 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
937 if ((fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
940 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
941 dataLoc
= allocateStackData(1);
942 appendOp(URX_LOOP_C
, dataLoc
);
947 // Emit general case code for this *
948 // The optimizations did not apply.
950 int32_t saveStateLoc
= blockTopLoc(TRUE
);
951 int32_t jmpOp
= buildOp(URX_JMP_SAV
, saveStateLoc
+1);
953 // Check for minimum match length of zero, which requires
954 // extra loop-breaking code.
955 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
956 insertOp(saveStateLoc
);
957 dataLoc
= allocateStackData(1);
959 int32_t op
= buildOp(URX_STO_INP_LOC
, dataLoc
);
960 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
961 jmpOp
= buildOp(URX_JMP_SAV_X
, saveStateLoc
+2);
964 // Locate the position in the compiled pattern where the match will continue
965 // after completing the *. (4 or 5 in the comment above)
966 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
968 // Put together the save state op and store it into the compiled code.
969 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, continueLoc
);
970 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
972 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
978 // Non-greedy *? quantifier
981 // 2. body of stuff being iterated over
985 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
986 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
987 int32_t jmpOp
= buildOp(URX_JMP
, saveLoc
);
988 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
989 appendOp(URX_STATE_SAVE
, jmpLoc
+1);
995 // The '{' opening an interval quantifier was just scanned.
996 // Init the counter varaiables that will accumulate the values as the digits
1002 case doIntevalLowerDigit
:
1003 // Scanned a digit from the lower value of an {lower,upper} interval
1005 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1006 U_ASSERT(digitValue
>= 0);
1007 int64_t val
= (int64_t)fIntervalLow
*10 + digitValue
;
1008 if (val
> INT32_MAX
) {
1009 error(U_REGEX_NUMBER_TOO_BIG
);
1011 fIntervalLow
= (int32_t)val
;
1016 case doIntervalUpperDigit
:
1017 // Scanned a digit from the upper value of an {lower,upper} interval
1019 if (fIntervalUpper
< 0) {
1022 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1023 U_ASSERT(digitValue
>= 0);
1024 int64_t val
= (int64_t)fIntervalUpper
*10 + digitValue
;
1025 if (val
> INT32_MAX
) {
1026 error(U_REGEX_NUMBER_TOO_BIG
);
1028 fIntervalUpper
= (int32_t)val
;
1033 case doIntervalSame
:
1034 // Scanned a single value interval like {27}. Upper = Lower.
1035 fIntervalUpper
= fIntervalLow
;
1039 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1040 if (compileInlineInterval() == FALSE
) {
1041 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1045 case doPossessiveInterval
:
1046 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1048 // Remember the loc for the top of the block being looped over.
1049 // (Can not reserve a slot in the compiled pattern at this time, because
1050 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1052 int32_t topLoc
= blockTopLoc(FALSE
);
1054 // Produce normal looping code.
1055 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1057 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1058 // just as if the loop was inclosed in atomic parentheses.
1060 // First the STO_SP before the start of the loop
1063 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the
1064 int32_t op
= buildOp(URX_STO_SP
, varLoc
);
1065 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1067 int32_t loopOp
= (int32_t)fRXPat
->fCompiledPat
->popi();
1068 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1069 loopOp
++; // point LoopOp after the just-inserted STO_SP
1070 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1072 // Then the LD_SP after the end of the loop
1073 appendOp(URX_LD_SP
, varLoc
);
1079 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1080 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1083 case doIntervalError
:
1084 error(U_REGEX_BAD_INTERVAL
);
1088 // We've just scanned a "normal" character from the pattern,
1089 literalChar(fC
.fChar
);
1093 case doEscapedLiteralChar
:
1094 // We've just scanned an backslashed escaped character with no
1095 // special meaning. It represents itself.
1096 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1097 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1098 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1099 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1101 literalChar(fC
.fChar
);
1106 // scanned a ".", match any single character.
1109 if (fModeFlags
& UREGEX_DOTALL
) {
1110 appendOp(URX_DOTANY_ALL
, 0);
1111 } else if (fModeFlags
& UREGEX_UNIX_LINES
) {
1112 appendOp(URX_DOTANY_UNIX
, 0);
1114 appendOp(URX_DOTANY
, 0);
1122 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1123 appendOp(URX_CARET
, 0);
1124 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1125 appendOp(URX_CARET_M
, 0);
1126 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1127 appendOp(URX_CARET
, 0); // Only testing true start of input.
1128 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1129 appendOp(URX_CARET_M_UNIX
, 0);
1137 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1138 appendOp(URX_DOLLAR
, 0);
1139 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1140 appendOp(URX_DOLLAR_M
, 0);
1141 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1142 appendOp(URX_DOLLAR_D
, 0);
1143 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1144 appendOp(URX_DOLLAR_MD
, 0);
1151 appendOp(URX_CARET
, 0);
1156 #if UCONFIG_NO_BREAK_ITERATION==1
1157 if (fModeFlags
& UREGEX_UWORD
) {
1158 error(U_UNSUPPORTED_ERROR
);
1162 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1169 #if UCONFIG_NO_BREAK_ITERATION==1
1170 if (fModeFlags
& UREGEX_UWORD
) {
1171 error(U_UNSUPPORTED_ERROR
);
1175 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1182 appendOp(URX_BACKSLASH_D
, 1);
1187 appendOp(URX_BACKSLASH_D
, 0);
1192 appendOp(URX_BACKSLASH_G
, 0);
1197 appendOp(URX_BACKSLASH_H
, 1);
1202 appendOp(URX_BACKSLASH_H
, 0);
1207 appendOp(URX_BACKSLASH_R
, 0);
1212 appendOp(URX_STAT_SETREF_N
, URX_ISSPACE_SET
);
1217 appendOp(URX_STATIC_SETREF
, URX_ISSPACE_SET
);
1222 appendOp(URX_BACKSLASH_V
, 1);
1227 appendOp(URX_BACKSLASH_V
, 0);
1232 appendOp(URX_STAT_SETREF_N
, URX_ISWORD_SET
);
1237 appendOp(URX_STATIC_SETREF
, URX_ISWORD_SET
);
1242 appendOp(URX_BACKSLASH_X
, 0);
1248 appendOp(URX_DOLLAR
, 0);
1253 appendOp(URX_BACKSLASH_Z
, 0);
1257 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1268 UnicodeSet
*theSet
= scanProp();
1275 UChar32 c
= scanNamedChar();
1282 // BackReference. Somewhat unusual in that the front-end can not completely parse
1283 // the regular expression, because the number of digits to be consumed
1284 // depends on the number of capture groups that have been defined. So
1285 // we have to do it here instead.
1287 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1288 int32_t groupNum
= 0;
1289 UChar32 c
= fC
.fChar
;
1292 // Loop once per digit, for max allowed number of digits in a back reference.
1293 int32_t digit
= u_charDigitValue(c
);
1294 groupNum
= groupNum
* 10 + digit
;
1295 if (groupNum
>= numCaptureGroups
) {
1299 if (RegexStaticSets::gStaticSets
->fRuleDigitsAlias
->contains(c
) == FALSE
) {
1305 // Scan of the back reference in the source regexp is complete. Now generate
1306 // the compiled code for it.
1307 // Because capture groups can be forward-referenced by back-references,
1308 // we fill the operand with the capture group number. At the end
1309 // of compilation, it will be changed to the variable's location.
1310 U_ASSERT(groupNum
> 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1311 // and shouldn't enter this code path at all.
1313 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1314 appendOp(URX_BACKREF_I
, groupNum
);
1316 appendOp(URX_BACKREF
, groupNum
);
1321 case doBeginNamedBackRef
:
1322 U_ASSERT(fCaptureName
== NULL
);
1323 fCaptureName
= new UnicodeString
;
1324 if (fCaptureName
== NULL
) {
1325 error(U_MEMORY_ALLOCATION_ERROR
);
1329 case doContinueNamedBackRef
:
1330 fCaptureName
->append(fC
.fChar
);
1333 case doCompleteNamedBackRef
:
1335 int32_t groupNumber
= uhash_geti(fRXPat
->fNamedCaptureMap
, fCaptureName
);
1336 if (groupNumber
== 0) {
1337 // Group name has not been defined.
1338 // Could be a forward reference. If we choose to support them at some
1339 // future time, extra mechanism will be required at this point.
1340 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
1342 // Given the number, handle identically to a \n numbered back reference.
1343 // See comments above, under doBackRef
1345 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1346 appendOp(URX_BACKREF_I
, groupNumber
);
1348 appendOp(URX_BACKREF
, groupNumber
);
1351 delete fCaptureName
;
1352 fCaptureName
= NULL
;
1356 case doPossessivePlus
:
1357 // Possessive ++ quantifier.
1360 // 2. body of stuff being iterated over
1366 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1367 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1371 int32_t topLoc
= blockTopLoc(TRUE
);
1372 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1373 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1374 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1376 // Emit the STATE_SAVE
1377 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1380 appendOp(URX_JMP
, topLoc
+1);
1383 appendOp(URX_LD_SP
, stoLoc
);
1387 case doPossessiveStar
:
1388 // Possessive *+ quantifier.
1392 // 3. body of stuff being iterated over
1396 // TODO: do something to cut back the state stack each time through the loop.
1398 // Reserve two slots at the top of the block.
1399 int32_t topLoc
= blockTopLoc(TRUE
);
1403 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1404 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1405 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1407 // Emit the SAVE_STATE 5
1408 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1409 op
= buildOp(URX_STATE_SAVE
, L7
);
1410 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1412 // Append the JMP operation.
1413 appendOp(URX_JMP
, topLoc
+1);
1415 // Emit the LD_SP loc
1416 appendOp(URX_LD_SP
, stoLoc
);
1420 case doPossessiveOpt
:
1421 // Possessive ?+ quantifier.
1425 // 3. body of optional block
1430 // Reserve two slots at the top of the block.
1431 int32_t topLoc
= blockTopLoc(TRUE
);
1435 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1436 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1437 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1439 // Emit the SAVE_STATE
1440 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1441 op
= buildOp(URX_STATE_SAVE
, continueLoc
);
1442 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1445 appendOp(URX_LD_SP
, stoLoc
);
1450 case doBeginMatchMode
:
1451 fNewModeFlags
= fModeFlags
;
1452 fSetModeFlag
= TRUE
;
1455 case doMatchMode
: // (?i) and similar
1459 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1460 case 0x64: /* 'd' */ bit
= UREGEX_UNIX_LINES
; break;
1461 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1462 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1463 case 0x75: /* 'u' */ bit
= 0; /* Unicode casing */ break;
1464 case 0x77: /* 'w' */ bit
= UREGEX_UWORD
; break;
1465 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1466 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1468 UPRV_UNREACHABLE
; // Should never happen. Other chars are filtered out
1472 fNewModeFlags
|= bit
;
1474 fNewModeFlags
&= ~bit
;
1479 case doSetMatchMode
:
1480 // Emit code to match any pending literals, using the not-yet changed match mode.
1483 // We've got a (?i) or similar. The match mode is being changed, but
1484 // the change is not scoped to a parenthesized block.
1485 U_ASSERT(fNewModeFlags
< 0);
1486 fModeFlags
= fNewModeFlags
;
1491 case doMatchModeParen
:
1492 // We've got a (?i: or similar. Begin a parenthesized block, save old
1493 // mode flags so they can be restored at the close of the block.
1496 // - NOP, which later may be replaced by a save-state if the
1497 // parenthesized group gets a * quantifier, followed by
1498 // - NOP, which may later be replaced by a save-state if there
1499 // is an '|' alternation within the parens.
1502 appendOp(URX_NOP
, 0);
1503 appendOp(URX_NOP
, 0);
1505 // On the Parentheses stack, start a new frame and add the postions
1506 // of the two NOPs (a normal non-capturing () frame, except for the
1507 // saving of the orignal mode flags.)
1508 fParenStack
.push(fModeFlags
, *fStatus
);
1509 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1510 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1511 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1513 // Set the current mode flags to the new values.
1514 U_ASSERT(fNewModeFlags
< 0);
1515 fModeFlags
= fNewModeFlags
;
1520 error(U_REGEX_INVALID_FLAG
);
1523 case doSuppressComments
:
1524 // We have just scanned a '(?'. We now need to prevent the character scanner from
1525 // treating a '#' as a to-the-end-of-line comment.
1526 // (This Perl compatibility just gets uglier and uglier to do...)
1527 fEOLComments
= FALSE
;
1533 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1540 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1545 case doSetBackslash_s
:
1547 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1548 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1552 case doSetBackslash_S
:
1554 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1555 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1561 case doSetBackslash_d
:
1563 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1564 // TODO - make a static set, ticket 6058.
1565 addCategory(set
, U_GC_ND_MASK
, *fStatus
);
1569 case doSetBackslash_D
:
1571 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1573 // TODO - make a static set, ticket 6058.
1574 digits
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
1575 digits
.complement();
1576 set
->addAll(digits
);
1580 case doSetBackslash_h
:
1582 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1584 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1585 h
.add((UChar32
)9); // Tab
1590 case doSetBackslash_H
:
1592 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1594 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1595 h
.add((UChar32
)9); // Tab
1601 case doSetBackslash_v
:
1603 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1604 set
->add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1605 set
->add((UChar32
)0x85);
1606 set
->add((UChar32
)0x2028, (UChar32
)0x2029);
1610 case doSetBackslash_V
:
1612 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1614 v
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1615 v
.add((UChar32
)0x85);
1616 v
.add((UChar32
)0x2028, (UChar32
)0x2029);
1622 case doSetBackslash_w
:
1624 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1625 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1629 case doSetBackslash_W
:
1631 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1632 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1640 fSetStack
.push(new UnicodeSet(), *fStatus
);
1641 fSetOpStack
.push(setStart
, *fStatus
);
1642 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1643 fSetOpStack
.push(setCaseClose
, *fStatus
);
1647 case doSetBeginDifference1
:
1648 // We have scanned something like [[abc]-[
1649 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1650 // Push a Difference operator, which will cause the new set to be subtracted from what
1651 // went before once it is created.
1652 setPushOp(setDifference1
);
1653 fSetOpStack
.push(setStart
, *fStatus
);
1654 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1655 fSetOpStack
.push(setCaseClose
, *fStatus
);
1659 case doSetBeginIntersection1
:
1660 // We have scanned something like [[abc]&[
1661 // Need both the '&' operator and the open '[' operator.
1662 setPushOp(setIntersection1
);
1663 fSetOpStack
.push(setStart
, *fStatus
);
1664 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1665 fSetOpStack
.push(setCaseClose
, *fStatus
);
1669 case doSetBeginUnion
:
1670 // We have scanned something like [[abc][
1671 // Need to handle the union operation explicitly [[abc] | [
1672 setPushOp(setUnion
);
1673 fSetOpStack
.push(setStart
, *fStatus
);
1674 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1675 fSetOpStack
.push(setCaseClose
, *fStatus
);
1679 case doSetDifference2
:
1680 // We have scanned something like [abc--
1681 // Consider this to unambiguously be a set difference operator.
1682 setPushOp(setDifference2
);
1686 // Have encountered the ']' that closes a set.
1687 // Force the evaluation of any pending operations within this set,
1688 // leave the completed set on the top of the set stack.
1690 U_ASSERT(fSetOpStack
.peeki()==setStart
);
1696 // Finished a complete set expression, including all nested sets.
1697 // The close bracket has already triggered clearing out pending set operators,
1698 // the operator stack should be empty and the operand stack should have just
1699 // one entry, the result set.
1700 U_ASSERT(fSetOpStack
.empty());
1701 UnicodeSet
*theSet
= (UnicodeSet
*)fSetStack
.pop();
1702 U_ASSERT(fSetStack
.empty());
1707 case doSetIntersection2
:
1708 // Have scanned something like [abc&&
1709 setPushOp(setIntersection2
);
1713 // Union the just-scanned literal character into the set being built.
1714 // This operation is the highest precedence set operation, so we can always do
1715 // it immediately, without waiting to see what follows. It is necessary to perform
1716 // any pending '-' or '&' operation first, because these have the same precedence
1717 // as union-ing in a literal'
1720 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1722 fLastSetLiteral
= fC
.fChar
;
1726 case doSetLiteralEscaped
:
1727 // A back-slash escaped literal character was encountered.
1728 // Processing is the same as with setLiteral, above, with the addition of
1729 // the optional check for errors on escaped ASCII letters.
1731 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1732 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1733 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1734 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1737 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1739 fLastSetLiteral
= fC
.fChar
;
1743 case doSetNamedChar
:
1744 // Scanning a \N{UNICODE CHARACTER NAME}
1745 // Aside from the source of the character, the processing is identical to doSetLiteral,
1748 UChar32 c
= scanNamedChar();
1750 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1752 fLastSetLiteral
= c
;
1756 case doSetNamedRange
:
1757 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1758 // The left character is already in the set, and is saved in fLastSetLiteral.
1759 // The right side needs to be picked up, the scan is at the 'N'.
1760 // Lower Limit > Upper limit being an error matches both Java
1761 // and ICU UnicodeSet behavior.
1763 UChar32 c
= scanNamedChar();
1764 if (U_SUCCESS(*fStatus
) && (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> c
)) {
1765 error(U_REGEX_INVALID_RANGE
);
1767 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1768 s
->add(fLastSetLiteral
, c
);
1769 fLastSetLiteral
= c
;
1775 // Scanned a '^' at the start of a set.
1776 // Push the negation operator onto the set op stack.
1777 // A twist for case-insensitive matching:
1778 // the case closure operation must happen _before_ negation.
1779 // But the case closure operation will already be on the stack if it's required.
1780 // This requires checking for case closure, and swapping the stack order
1781 // if it is present.
1783 int32_t tosOp
= fSetOpStack
.peeki();
1784 if (tosOp
== setCaseClose
) {
1786 fSetOpStack
.push(setNegation
, *fStatus
);
1787 fSetOpStack
.push(setCaseClose
, *fStatus
);
1789 fSetOpStack
.push(setNegation
, *fStatus
);
1794 case doSetNoCloseError
:
1795 error(U_REGEX_MISSING_CLOSE_BRACKET
);
1799 error(U_REGEX_RULE_SYNTAX
); // -- or && at the end of a set. Illegal.
1802 case doSetPosixProp
:
1804 UnicodeSet
*s
= scanPosixProp();
1806 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1809 } // else error. scanProp() reported the error status already.
1814 // Scanned a \p \P within [brackets].
1816 UnicodeSet
*s
= scanProp();
1818 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1821 } // else error. scanProp() reported the error status already.
1827 // We have scanned literal-literal. Add the range to the set.
1828 // The left character is already in the set, and is saved in fLastSetLiteral.
1829 // The right side is the current character.
1830 // Lower Limit > Upper limit being an error matches both Java
1831 // and ICU UnicodeSet behavior.
1834 if (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> fC
.fChar
) {
1835 error(U_REGEX_INVALID_RANGE
);
1837 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1838 s
->add(fLastSetLiteral
, fC
.fChar
);
1846 if (U_FAILURE(*fStatus
)) {
1855 //------------------------------------------------------------------------------
1857 // literalChar We've encountered a literal character from the pattern,
1858 // or an escape sequence that reduces to a character.
1859 // Add it to the string containing all literal chars/strings from
1862 //------------------------------------------------------------------------------
1863 void RegexCompile::literalChar(UChar32 c
) {
1864 fLiteralChars
.append(c
);
1868 //------------------------------------------------------------------------------
1870 // fixLiterals When compiling something that can follow a literal
1871 // string in a pattern, emit the code to match the
1872 // accumulated literal string.
1874 // Optionally, split the last char of the string off into
1875 // a single "ONE_CHAR" operation, so that quantifiers can
1876 // apply to that char alone. Example: abc*
1877 // The * must apply to the 'c' only.
1879 //------------------------------------------------------------------------------
1880 void RegexCompile::fixLiterals(UBool split
) {
1882 // If no literal characters have been scanned but not yet had code generated
1883 // for them, nothing needs to be done.
1884 if (fLiteralChars
.length() == 0) {
1888 int32_t indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1889 UChar32 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1891 // Split: We need to ensure that the last item in the compiled pattern
1892 // refers only to the last literal scanned in the pattern, so that
1893 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1894 // Split before case folding for case insensitive matches.
1897 fLiteralChars
.truncate(indexOfLastCodePoint
);
1898 fixLiterals(FALSE
); // Recursive call, emit code to match the first part of the string.
1899 // Note that the truncated literal string may be empty, in which case
1900 // nothing will be emitted.
1902 literalChar(lastCodePoint
); // Re-add the last code point as if it were a new literal.
1903 fixLiterals(FALSE
); // Second recursive call, code for the final code point.
1907 // If we are doing case-insensitive matching, case fold the string. This may expand
1908 // the string, e.g. the German sharp-s turns into "ss"
1909 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1910 fLiteralChars
.foldCase();
1911 indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1912 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1915 if (indexOfLastCodePoint
== 0) {
1916 // Single character, emit a URX_ONECHAR op to match it.
1917 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1918 u_hasBinaryProperty(lastCodePoint
, UCHAR_CASE_SENSITIVE
)) {
1919 appendOp(URX_ONECHAR_I
, lastCodePoint
);
1921 appendOp(URX_ONECHAR
, lastCodePoint
);
1924 // Two or more chars, emit a URX_STRING to match them.
1925 if (fLiteralChars
.length() > 0x00ffffff || fRXPat
->fLiteralText
.length() > 0x00ffffff) {
1926 error(U_REGEX_PATTERN_TOO_BIG
);
1928 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1929 appendOp(URX_STRING_I
, fRXPat
->fLiteralText
.length());
1931 // TODO here: add optimization to split case sensitive strings of length two
1932 // into two single char ops, for efficiency.
1933 appendOp(URX_STRING
, fRXPat
->fLiteralText
.length());
1935 appendOp(URX_STRING_LEN
, fLiteralChars
.length());
1937 // Add this string into the accumulated strings of the compiled pattern.
1938 fRXPat
->fLiteralText
.append(fLiteralChars
);
1941 fLiteralChars
.remove();
1945 int32_t RegexCompile::buildOp(int32_t type
, int32_t val
) {
1946 if (U_FAILURE(*fStatus
)) {
1949 if (type
< 0 || type
> 255) {
1952 if (val
> 0x00ffffff) {
1956 if (!(type
== URX_RESERVED_OP_N
|| type
== URX_RESERVED_OP
)) {
1959 if (URX_TYPE(val
) != 0xff) {
1962 type
= URX_RESERVED_OP_N
;
1964 return (type
<< 24) | val
;
1968 //------------------------------------------------------------------------------
1970 // appendOp() Append a new instruction onto the compiled pattern
1971 // Includes error checking, limiting the size of the
1972 // pattern to lengths that can be represented in the
1973 // 24 bit operand field of an instruction.
1975 //------------------------------------------------------------------------------
1976 void RegexCompile::appendOp(int32_t op
) {
1977 if (U_FAILURE(*fStatus
)) {
1980 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1981 if ((fRXPat
->fCompiledPat
->size() > 0x00fffff0) && U_SUCCESS(*fStatus
)) {
1982 error(U_REGEX_PATTERN_TOO_BIG
);
1986 void RegexCompile::appendOp(int32_t type
, int32_t val
) {
1987 appendOp(buildOp(type
, val
));
1991 //------------------------------------------------------------------------------
1993 // insertOp() Insert a slot for a new opcode into the already
1994 // compiled pattern code.
1996 // Fill the slot with a NOP. Our caller will replace it
1997 // with what they really wanted.
1999 //------------------------------------------------------------------------------
2000 void RegexCompile::insertOp(int32_t where
) {
2001 UVector64
*code
= fRXPat
->fCompiledPat
;
2002 U_ASSERT(where
>0 && where
< code
->size());
2004 int32_t nop
= buildOp(URX_NOP
, 0);
2005 code
->insertElementAt(nop
, where
, *fStatus
);
2007 // Walk through the pattern, looking for any ops with targets that
2008 // were moved down by the insert. Fix them.
2010 for (loc
=0; loc
<code
->size(); loc
++) {
2011 int32_t op
= (int32_t)code
->elementAti(loc
);
2012 int32_t opType
= URX_TYPE(op
);
2013 int32_t opValue
= URX_VAL(op
);
2014 if ((opType
== URX_JMP
||
2015 opType
== URX_JMPX
||
2016 opType
== URX_STATE_SAVE
||
2017 opType
== URX_CTR_LOOP
||
2018 opType
== URX_CTR_LOOP_NG
||
2019 opType
== URX_JMP_SAV
||
2020 opType
== URX_JMP_SAV_X
||
2021 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
2022 // Target location for this opcode is after the insertion point and
2023 // needs to be incremented to adjust for the insertion.
2025 op
= buildOp(opType
, opValue
);
2026 code
->setElementAt(op
, loc
);
2030 // Now fix up the parentheses stack. All positive values in it are locations in
2031 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2032 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
2033 int32_t x
= fParenStack
.elementAti(loc
);
2034 U_ASSERT(x
< code
->size());
2037 fParenStack
.setElementAt(x
, loc
);
2041 if (fMatchCloseParen
> where
) {
2044 if (fMatchOpenParen
> where
) {
2050 //------------------------------------------------------------------------------
2052 // allocateData() Allocate storage in the matcher's static data area.
2053 // Return the index for the newly allocated data.
2054 // The storage won't actually exist until we are running a match
2055 // operation, but the storage indexes are inserted into various
2056 // opcodes while compiling the pattern.
2058 //------------------------------------------------------------------------------
2059 int32_t RegexCompile::allocateData(int32_t size
) {
2060 if (U_FAILURE(*fStatus
)) {
2063 if (size
<= 0 || size
> 0x100 || fRXPat
->fDataSize
< 0) {
2064 error(U_REGEX_INTERNAL_ERROR
);
2067 int32_t dataIndex
= fRXPat
->fDataSize
;
2068 fRXPat
->fDataSize
+= size
;
2069 if (fRXPat
->fDataSize
>= 0x00fffff0) {
2070 error(U_REGEX_INTERNAL_ERROR
);
2076 //------------------------------------------------------------------------------
2078 // allocateStackData() Allocate space in the back-tracking stack frame.
2079 // Return the index for the newly allocated data.
2080 // The frame indexes are inserted into various
2081 // opcodes while compiling the pattern, meaning that frame
2082 // size must be restricted to the size that will fit
2083 // as an operand (24 bits).
2085 //------------------------------------------------------------------------------
2086 int32_t RegexCompile::allocateStackData(int32_t size
) {
2087 if (U_FAILURE(*fStatus
)) {
2090 if (size
<= 0 || size
> 0x100 || fRXPat
->fFrameSize
< 0) {
2091 error(U_REGEX_INTERNAL_ERROR
);
2094 int32_t dataIndex
= fRXPat
->fFrameSize
;
2095 fRXPat
->fFrameSize
+= size
;
2096 if (fRXPat
->fFrameSize
>= 0x00fffff0) {
2097 error(U_REGEX_PATTERN_TOO_BIG
);
2103 //------------------------------------------------------------------------------
2105 // blockTopLoc() Find or create a location in the compiled pattern
2106 // at the start of the operation or block that has
2107 // just been compiled. Needed when a quantifier (* or
2108 // whatever) appears, and we need to add an operation
2109 // at the start of the thing being quantified.
2111 // (Parenthesized Blocks) have a slot with a NOP that
2112 // is reserved for this purpose. .* or similar don't
2113 // and a slot needs to be added.
2115 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2116 // at the returned location.
2117 // FALSE - just return the address,
2118 // do not reserve a location there.
2120 //------------------------------------------------------------------------------
2121 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
2123 fixLiterals(TRUE
); // Emit code for any pending literals.
2124 // If last item was a string, emit separate op for the its last char.
2125 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
2127 // The item just processed is a parenthesized block.
2128 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
2129 U_ASSERT(theLoc
> 0);
2130 U_ASSERT(URX_TYPE(((uint32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
))) == URX_NOP
);
2133 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2134 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2135 // We need to make space now.
2136 theLoc
= fRXPat
->fCompiledPat
->size()-1;
2137 int32_t opAtTheLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
);
2138 if (URX_TYPE(opAtTheLoc
) == URX_STRING_LEN
) {
2139 // Strings take two opcode, we want the position of the first one.
2140 // We can have a string at this point if a single character case-folded to two.
2144 int32_t nop
= buildOp(URX_NOP
, 0);
2145 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
2153 //------------------------------------------------------------------------------
2155 // handleCloseParen When compiling a close paren, we need to go back
2156 // and fix up any JMP or SAVE operations within the
2157 // parenthesized block that need to target the end
2158 // of the block. The locations of these are kept on
2159 // the paretheses stack.
2161 // This function is called both when encountering a
2162 // real ) and at the end of the pattern.
2164 //------------------------------------------------------------------------------
2165 void RegexCompile::handleCloseParen() {
2168 if (fParenStack
.size() <= 0) {
2169 error(U_REGEX_MISMATCHED_PAREN
);
2173 // Emit code for any pending literals.
2176 // Fixup any operations within the just-closed parenthesized group
2177 // that need to reference the end of the (block).
2178 // (The first one popped from the stack is an unused slot for
2179 // alternation (OR) state save, but applying the fixup to it does no harm.)
2181 patIdx
= fParenStack
.popi();
2183 // value < 0 flags the start of the frame on the paren stack.
2186 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
2187 patOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(patIdx
);
2188 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
2189 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
2190 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
2191 fMatchOpenParen
= patIdx
;
2194 // At the close of any parenthesized block, restore the match mode flags to
2195 // the value they had at the open paren. Saved value is
2196 // at the top of the paren stack.
2197 fModeFlags
= fParenStack
.popi();
2198 U_ASSERT(fModeFlags
< 0);
2200 // DO any additional fixups, depending on the specific kind of
2201 // parentesized grouping this is
2206 // No additional fixups required.
2207 // (Grouping-only parentheses)
2210 // Capturing Parentheses.
2211 // Insert a End Capture op into the pattern.
2212 // The frame offset of the variables for this cg is obtained from the
2213 // start capture op and put it into the end-capture op.
2215 int32_t captureOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2216 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
2218 int32_t frameVarLocation
= URX_VAL(captureOp
);
2219 appendOp(URX_END_CAPTURE
, frameVarLocation
);
2223 // Atomic Parenthesis.
2224 // Insert a LD_SP operation to restore the state stack to the position
2225 // it was when the atomic parens were entered.
2227 int32_t stoOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2228 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
2229 int32_t stoLoc
= URX_VAL(stoOp
);
2230 appendOp(URX_LD_SP
, stoLoc
);
2236 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2237 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2238 int32_t dataLoc
= URX_VAL(startOp
);
2239 appendOp(URX_LA_END
, dataLoc
);
2245 // See comment at doOpenLookAheadNeg
2246 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
2247 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2248 int32_t dataLoc
= URX_VAL(startOp
);
2249 appendOp(URX_LA_END
, dataLoc
);
2250 appendOp(URX_BACKTRACK
, 0);
2251 appendOp(URX_LA_END
, dataLoc
);
2253 // Patch the URX_SAVE near the top of the block.
2254 // The destination of the SAVE is the final LA_END that was just added.
2255 int32_t saveOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
2256 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
2257 int32_t dest
= fRXPat
->fCompiledPat
->size()-1;
2258 saveOp
= buildOp(URX_STATE_SAVE
, dest
);
2259 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
2265 // See comment at doOpenLookBehind.
2267 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2268 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
2269 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2270 int32_t dataLoc
= URX_VAL(startOp
);
2271 appendOp(URX_LB_END
, dataLoc
);
2272 appendOp(URX_LA_END
, dataLoc
);
2274 // Determine the min and max bounds for the length of the
2275 // string that the pattern can match.
2276 // An unbounded upper limit is an error.
2277 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2278 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2279 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2280 if (URX_TYPE(maxML
) != 0) {
2281 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2284 if (maxML
== INT32_MAX
) {
2285 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2288 if (minML
== INT32_MAX
&& maxML
== 0) {
2289 // This condition happens when no match is possible, such as with a
2290 // [set] expression containing no elements.
2291 // In principle, the generated code to evaluate the expression could be deleted,
2292 // but it's probably not worth the complication.
2295 U_ASSERT(minML
<= maxML
);
2297 // Insert the min and max match len bounds into the URX_LB_CONT op that
2298 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2299 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
2300 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
2309 // See comment at doOpenLookBehindNeg.
2311 // Append the URX_LBN_END to the compiled pattern.
2312 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2313 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2314 int32_t dataLoc
= URX_VAL(startOp
);
2315 appendOp(URX_LBN_END
, dataLoc
);
2317 // Determine the min and max bounds for the length of the
2318 // string that the pattern can match.
2319 // An unbounded upper limit is an error.
2320 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2321 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2322 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2323 if (URX_TYPE(maxML
) != 0) {
2324 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2327 if (maxML
== INT32_MAX
) {
2328 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2331 if (minML
== INT32_MAX
&& maxML
== 0) {
2332 // This condition happens when no match is possible, such as with a
2333 // [set] expression containing no elements.
2334 // In principle, the generated code to evaluate the expression could be deleted,
2335 // but it's probably not worth the complication.
2339 U_ASSERT(minML
<= maxML
);
2341 // Insert the min and max match len bounds into the URX_LB_CONT op that
2342 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2343 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
2344 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
2346 // Insert the pattern location to continue at after a successful match
2347 // as the last operand of the URX_LBN_CONT
2348 int32_t op
= buildOp(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
2349 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
2359 // remember the next location in the compiled pattern.
2360 // The compilation of Quantifiers will look at this to see whether its looping
2361 // over a parenthesized block or a single item
2362 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
2367 //------------------------------------------------------------------------------
2369 // compileSet Compile the pattern operations for a reference to a
2372 //------------------------------------------------------------------------------
2373 void RegexCompile::compileSet(UnicodeSet
*theSet
)
2375 if (theSet
== NULL
) {
2378 // Remove any strings from the set.
2379 // There shoudn't be any, but just in case.
2380 // (Case Closure can add them; if we had a simple case closure avaialble that
2381 // ignored strings, that would be better.)
2382 theSet
->removeAllStrings();
2383 int32_t setSize
= theSet
->size();
2388 // Set of no elements. Always fails to match.
2389 appendOp(URX_BACKTRACK
, 0);
2396 // The set contains only a single code point. Put it into
2397 // the compiled pattern as a single char operation rather
2398 // than a set, and discard the set itself.
2399 literalChar(theSet
->charAt(0));
2406 // The set contains two or more chars. (the normal case)
2407 // Put it into the compiled pattern as a set.
2408 int32_t setNumber
= fRXPat
->fSets
->size();
2409 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
2410 appendOp(URX_SETREF
, setNumber
);
2416 //------------------------------------------------------------------------------
2418 // compileInterval Generate the code for a {min, max} style interval quantifier.
2419 // Except for the specific opcodes used, the code is the same
2420 // for all three types (greedy, non-greedy, possessive) of
2421 // intervals. The opcodes are supplied as parameters.
2422 // (There are two sets of opcodes - greedy & possessive use the
2423 // same ones, while non-greedy has it's own.)
2425 // The code for interval loops has this form:
2426 // 0 CTR_INIT counter loc (in stack frame)
2427 // 1 5 patt address of CTR_LOOP at bottom of block
2429 // 3 max count (-1 for unbounded)
2430 // 4 ... block to be iterated over
2434 //------------------------------------------------------------------------------
2435 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
2437 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2438 // four slots in the compiled code. Reserve them.
2439 int32_t topOfBlock
= blockTopLoc(TRUE
);
2440 insertOp(topOfBlock
);
2441 insertOp(topOfBlock
);
2442 insertOp(topOfBlock
);
2444 // The operands for the CTR_INIT opcode include the index in the matcher data
2445 // of the counter. Allocate it now. There are two data items
2446 // counterLoc --> Loop counter
2447 // +1 --> Input index (for breaking non-progressing loops)
2448 // (Only present if unbounded upper limit on loop)
2449 int32_t dataSize
= fIntervalUpper
< 0 ? 2 : 1;
2450 int32_t counterLoc
= allocateStackData(dataSize
);
2452 int32_t op
= buildOp(InitOp
, counterLoc
);
2453 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
2455 // The second operand of CTR_INIT is the location following the end of the loop.
2456 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2457 // compilation of something later on causes the code to grow and the target
2458 // position to move.
2459 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
2460 op
= buildOp(URX_RELOC_OPRND
, loopEnd
);
2461 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
2463 // Followed by the min and max counts.
2464 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
2465 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
2467 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2468 // Goes at end of the block being looped over, so just append to the code so far.
2469 appendOp(LoopOp
, topOfBlock
);
2471 if ((fIntervalLow
& 0xff000000) != 0 ||
2472 (fIntervalUpper
> 0 && (fIntervalUpper
& 0xff000000) != 0)) {
2473 error(U_REGEX_NUMBER_TOO_BIG
);
2476 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
2477 error(U_REGEX_MAX_LT_MIN
);
2483 UBool
RegexCompile::compileInlineInterval() {
2484 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
2485 // Too big to inline. Fail, which will cause looping code to be generated.
2486 // (Upper < Lower picks up unbounded upper and errors, both.)
2490 int32_t topOfBlock
= blockTopLoc(FALSE
);
2491 if (fIntervalUpper
== 0) {
2492 // Pathological case. Attempt no matches, as if the block doesn't exist.
2493 // Discard the generated code for the block.
2494 // If the block included parens, discard the info pertaining to them as well.
2495 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2496 if (fMatchOpenParen
>= topOfBlock
) {
2497 fMatchOpenParen
= -1;
2499 if (fMatchCloseParen
>= topOfBlock
) {
2500 fMatchCloseParen
= -1;
2505 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2506 // The thing being repeated is not a single op, but some
2507 // more complex block. Do it as a loop, not inlines.
2508 // Note that things "repeated" a max of once are handled as inline, because
2509 // the one copy of the code already generated is just fine.
2513 // Pick up the opcode that is to be repeated
2515 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2517 // Compute the pattern location where the inline sequence
2518 // will end, and set up the state save op that will be needed.
2520 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2521 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2522 int32_t saveOp
= buildOp(URX_STATE_SAVE
, endOfSequenceLoc
);
2523 if (fIntervalLow
== 0) {
2524 insertOp(topOfBlock
);
2525 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2530 // Loop, emitting the op for the thing being repeated each time.
2531 // Loop starts at 1 because one instance of the op already exists in the pattern,
2532 // it was put there when it was originally encountered.
2534 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2535 if (i
>= fIntervalLow
) {
2545 //------------------------------------------------------------------------------
2547 // caseInsensitiveStart given a single code point from a pattern string, determine the
2548 // set of characters that could potentially begin a case-insensitive
2549 // match of a string beginning with that character, using full Unicode
2550 // case insensitive matching.
2552 // This is used in optimizing find().
2554 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2555 // misses cases like this:
2556 // A string from the pattern begins with 'ss' (although all we know
2557 // in this context is that it begins with 's')
2558 // The pattern could match a string beginning with a German sharp-s
2560 // To the ordinary case closure for a character c, we add all other
2561 // characters cx where the case closure of cx incudes a string form that begins
2562 // with the original character c.
2564 // This function could be made smarter. The full pattern string is available
2565 // and it would be possible to verify that the extra characters being added
2566 // to the starting set fully match, rather than having just a first-char of the
2567 // folded form match.
2569 //------------------------------------------------------------------------------
2570 void RegexCompile::findCaseInsensitiveStarters(UChar32 c
, UnicodeSet
*starterChars
) {
2572 // Machine Generated below.
2573 // It may need updating with new versions of Unicode.
2574 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2575 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2577 // Machine Generated Data. Do not hand edit.
2578 static const UChar32 RECaseFixCodePoints
[] = {
2579 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2580 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2581 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2582 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2583 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2585 static const int16_t RECaseFixStringOffsets
[] = {
2586 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2587 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2588 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2589 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2590 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2592 static const int16_t RECaseFixCounts
[] = {
2593 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2594 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2595 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2596 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2597 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2599 static const UChar RECaseFixData
[] = {
2600 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2601 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2602 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2603 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2604 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2605 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2606 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2607 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2608 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2609 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2610 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2612 // End of machine generated data.
2614 if (c
< UCHAR_MIN_VALUE
|| c
> UCHAR_MAX_VALUE
) {
2615 // This function should never be called with an invalid input character.
2617 } else if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2618 UChar32 caseFoldedC
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
2619 starterChars
->set(caseFoldedC
, caseFoldedC
);
2622 for (i
=0; RECaseFixCodePoints
[i
]<c
; i
++) {
2623 // Simple linear search through the sorted list of interesting code points.
2626 if (RECaseFixCodePoints
[i
] == c
) {
2627 int32_t dataIndex
= RECaseFixStringOffsets
[i
];
2628 int32_t numCharsToAdd
= RECaseFixCounts
[i
];
2629 UChar32 cpToAdd
= 0;
2630 for (int32_t j
=0; j
<numCharsToAdd
; j
++) {
2631 U16_NEXT_UNSAFE(RECaseFixData
, dataIndex
, cpToAdd
);
2632 starterChars
->add(cpToAdd
);
2636 starterChars
->closeOver(USET_CASE_INSENSITIVE
);
2637 starterChars
->removeAllStrings();
2639 // Not a cased character. Just return it alone.
2640 starterChars
->set(c
, c
);
2645 // Increment with overflow check.
2646 // val and delta will both be positive.
2648 static int32_t safeIncrement(int32_t val
, int32_t delta
) {
2649 if (INT32_MAX
- val
> delta
) {
2657 //------------------------------------------------------------------------------
2659 // matchStartType Determine how a match can start.
2660 // Used to optimize find() operations.
2662 // Operation is very similar to minMatchLength(). Walk the compiled
2663 // pattern, keeping an on-going minimum-match-length. For any
2664 // op where the min match coming in is zero, add that ops possible
2665 // starting matches to the possible starts for the overall pattern.
2667 //------------------------------------------------------------------------------
2668 void RegexCompile::matchStartType() {
2669 if (U_FAILURE(*fStatus
)) {
2674 int32_t loc
; // Location in the pattern of the current op being processed.
2675 int32_t op
; // The op being processed
2676 int32_t opType
; // The opcode type of the op
2677 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2678 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2680 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2681 // could have advanced the position in a match.
2682 // (Maximum match length so far == 0)
2684 // forwardedLength is a vector holding minimum-match-length values that
2685 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2686 // It must be one longer than the pattern being checked because some ops
2687 // will jmp to a end-of-block+1 location from within a block, and we must
2688 // count those when checking the block.
2689 int32_t end
= fRXPat
->fCompiledPat
->size();
2690 UVector32
forwardedLength(end
+1, *fStatus
);
2691 forwardedLength
.setSize(end
+1);
2692 for (loc
=3; loc
<end
; loc
++) {
2693 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2696 for (loc
= 3; loc
<end
; loc
++) {
2697 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2698 opType
= URX_TYPE(op
);
2700 // The loop is advancing linearly through the pattern.
2701 // If the op we are now at was the destination of a branch in the pattern,
2702 // and that path has a shorter minimum length than the current accumulated value,
2703 // replace the current accumulated value.
2704 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2705 currentLen
= forwardedLength
.elementAti(loc
);
2706 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2710 // Ops that don't change the total length matched
2711 case URX_RESERVED_OP
:
2714 case URX_STRING_LEN
:
2716 case URX_START_CAPTURE
:
2717 case URX_END_CAPTURE
:
2718 case URX_BACKSLASH_B
:
2719 case URX_BACKSLASH_BU
:
2720 case URX_BACKSLASH_G
:
2721 case URX_BACKSLASH_Z
:
2726 case URX_RELOC_OPRND
:
2727 case URX_STO_INP_LOC
:
2728 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2731 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2737 fRXPat
->fStartType
= START_START
;
2742 case URX_CARET_M_UNIX
:
2744 fRXPat
->fStartType
= START_LINE
;
2749 if (currentLen
== 0) {
2750 // This character could appear at the start of a match.
2751 // Add it to the set of possible starting characters.
2752 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2753 numInitialStrings
+= 2;
2755 currentLen
= safeIncrement(currentLen
, 1);
2761 if (currentLen
== 0) {
2762 int32_t sn
= URX_VAL(op
);
2763 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2764 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2765 fRXPat
->fInitialChars
->addAll(*s
);
2766 numInitialStrings
+= 2;
2768 currentLen
= safeIncrement(currentLen
, 1);
2773 // [Set]*, like a SETREF, above, in what it can match,
2774 // but may not match at all, so currentLen is not incremented.
2775 if (currentLen
== 0) {
2776 int32_t sn
= URX_VAL(op
);
2777 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2778 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2779 fRXPat
->fInitialChars
->addAll(*s
);
2780 numInitialStrings
+= 2;
2785 case URX_LOOP_DOT_I
:
2786 if (currentLen
== 0) {
2787 // .* at the start of a pattern.
2788 // Any character can begin the match.
2789 fRXPat
->fInitialChars
->clear();
2790 fRXPat
->fInitialChars
->complement();
2791 numInitialStrings
+= 2;
2797 case URX_STATIC_SETREF
:
2798 if (currentLen
== 0) {
2799 int32_t sn
= URX_VAL(op
);
2800 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2801 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2802 fRXPat
->fInitialChars
->addAll(*s
);
2803 numInitialStrings
+= 2;
2805 currentLen
= safeIncrement(currentLen
, 1);
2811 case URX_STAT_SETREF_N
:
2812 if (currentLen
== 0) {
2813 int32_t sn
= URX_VAL(op
);
2814 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2817 fRXPat
->fInitialChars
->addAll(sc
);
2818 numInitialStrings
+= 2;
2820 currentLen
= safeIncrement(currentLen
, 1);
2826 case URX_BACKSLASH_D
:
2828 if (currentLen
== 0) {
2830 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2831 if (URX_VAL(op
) != 0) {
2834 fRXPat
->fInitialChars
->addAll(s
);
2835 numInitialStrings
+= 2;
2837 currentLen
= safeIncrement(currentLen
, 1);
2842 case URX_BACKSLASH_H
:
2843 // Horiz white space
2844 if (currentLen
== 0) {
2846 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
2847 s
.add((UChar32
)9); // Tab
2848 if (URX_VAL(op
) != 0) {
2851 fRXPat
->fInitialChars
->addAll(s
);
2852 numInitialStrings
+= 2;
2854 currentLen
= safeIncrement(currentLen
, 1);
2859 case URX_BACKSLASH_R
: // Any line ending sequence
2860 case URX_BACKSLASH_V
: // Any line ending code point, with optional negation
2861 if (currentLen
== 0) {
2863 s
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
2864 s
.add((UChar32
)0x85);
2865 s
.add((UChar32
)0x2028, (UChar32
)0x2029);
2866 if (URX_VAL(op
) != 0) {
2867 // Complement option applies to URX_BACKSLASH_V only.
2870 fRXPat
->fInitialChars
->addAll(s
);
2871 numInitialStrings
+= 2;
2873 currentLen
= safeIncrement(currentLen
, 1);
2880 // Case Insensitive Single Character.
2881 if (currentLen
== 0) {
2882 UChar32 c
= URX_VAL(op
);
2883 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2884 UnicodeSet
starters(c
, c
);
2885 starters
.closeOver(USET_CASE_INSENSITIVE
);
2886 // findCaseInsensitiveStarters(c, &starters);
2887 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2888 // The expanded folding can't match the pattern.
2889 fRXPat
->fInitialChars
->addAll(starters
);
2891 // Char has no case variants. Just add it as-is to the
2892 // set of possible starting chars.
2893 fRXPat
->fInitialChars
->add(c
);
2895 numInitialStrings
+= 2;
2897 currentLen
= safeIncrement(currentLen
, 1);
2902 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2903 case URX_DOTANY_ALL
: // . matches one or two.
2905 case URX_DOTANY_UNIX
:
2906 if (currentLen
== 0) {
2907 // These constructs are all bad news when they appear at the start
2908 // of a match. Any character can begin the match.
2909 fRXPat
->fInitialChars
->clear();
2910 fRXPat
->fInitialChars
->complement();
2911 numInitialStrings
+= 2;
2913 currentLen
= safeIncrement(currentLen
, 1);
2919 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2923 int32_t jmpDest
= URX_VAL(op
);
2924 if (jmpDest
< loc
) {
2925 // Loop of some kind. Can safely ignore, the worst that will happen
2926 // is that we understate the true minimum length
2927 currentLen
= forwardedLength
.elementAti(loc
+1);
2930 // Forward jump. Propagate the current min length to the target loc of the jump.
2931 U_ASSERT(jmpDest
<= end
+1);
2932 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2933 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2942 // Combo of state save to the next loc, + jmp backwards.
2943 // Net effect on min. length computation is nothing.
2948 // Fails are kind of like a branch, except that the min length was
2949 // propagated already, by the state save.
2950 currentLen
= forwardedLength
.elementAti(loc
+1);
2955 case URX_STATE_SAVE
:
2957 // State Save, for forward jumps, propagate the current minimum.
2958 // of the state save.
2959 int32_t jmpDest
= URX_VAL(op
);
2960 if (jmpDest
> loc
) {
2961 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2962 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2975 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2976 int32_t stringLen
= URX_VAL(stringLenOp
);
2977 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2978 U_ASSERT(stringLenOp
>= 2);
2979 if (currentLen
== 0) {
2980 // Add the starting character of this string to the set of possible starting
2981 // characters for this pattern.
2982 int32_t stringStartIdx
= URX_VAL(op
);
2983 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2984 fRXPat
->fInitialChars
->add(c
);
2986 // Remember this string. After the entire pattern has been checked,
2987 // if nothing else is identified that can start a match, we'll use it.
2988 numInitialStrings
++;
2989 fRXPat
->fInitialStringIdx
= stringStartIdx
;
2990 fRXPat
->fInitialStringLen
= stringLen
;
2993 currentLen
= safeIncrement(currentLen
, stringLen
);
3000 // Case-insensitive string. Unlike exact-match strings, we won't
3001 // attempt a string search for possible match positions. But we
3002 // do update the set of possible starting characters.
3004 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3005 int32_t stringLen
= URX_VAL(stringLenOp
);
3006 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
3007 U_ASSERT(stringLenOp
>= 2);
3008 if (currentLen
== 0) {
3009 // Add the starting character of this string to the set of possible starting
3010 // characters for this pattern.
3011 int32_t stringStartIdx
= URX_VAL(op
);
3012 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
3014 findCaseInsensitiveStarters(c
, &s
);
3015 fRXPat
->fInitialChars
->addAll(s
);
3016 numInitialStrings
+= 2; // Matching on an initial string not possible.
3018 currentLen
= safeIncrement(currentLen
, stringLen
);
3024 case URX_CTR_INIT_NG
:
3026 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3027 // so location must be updated accordingly.
3029 // If the min loop count == 0
3030 // move loc forwards to the end of the loop, skipping over the body.
3031 // If the min count is > 0,
3032 // continue normal processing of the body of the loop.
3033 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3034 loopEndLoc
= URX_VAL(loopEndLoc
);
3035 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3036 if (minLoopCount
== 0) {
3037 // Min Loop Count of 0, treat like a forward branch and
3038 // move the current minimum length up to the target
3039 // (end of loop) location.
3040 U_ASSERT(loopEndLoc
<= end
+1);
3041 if (forwardedLength
.elementAti(loopEndLoc
) > currentLen
) {
3042 forwardedLength
.setElementAt(currentLen
, loopEndLoc
);
3045 loc
+=3; // Skips over operands of CTR_INIT
3052 case URX_CTR_LOOP_NG
:
3054 // The jump is conditional, backwards only.
3059 // More loop ops. These state-save to themselves.
3060 // don't change the minimum match
3068 // Look-around. Scan forward until the matching look-ahead end,
3069 // without processing the look-around block. This is overly pessimistic.
3071 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3072 // lookahead contains two LA_END instructions, so count goes up by two
3073 // for each LA_START.
3074 int32_t depth
= (opType
== URX_LA_START
? 2: 1);
3077 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3078 if (URX_TYPE(op
) == URX_LA_START
) {
3081 if (URX_TYPE(op
) == URX_LB_START
) {
3084 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3090 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3091 // Need this because neg lookahead blocks will FAIL to outside
3093 int32_t jmpDest
= URX_VAL(op
);
3094 if (jmpDest
> loc
) {
3095 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3096 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3100 U_ASSERT(loc
<= end
);
3110 UPRV_UNREACHABLE
; // Shouldn't get here. These ops should be
3111 // consumed by the scan in URX_LA_START and LB_START
3119 // We have finished walking through the ops. Check whether some forward jump
3120 // propagated a shorter length to location end+1.
3121 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3122 currentLen
= forwardedLength
.elementAti(end
+1);
3126 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
3129 // Sort out what we should check for when looking for candidate match start positions.
3130 // In order of preference,
3131 // 1. Start of input text buffer.
3132 // 2. A literal string.
3133 // 3. Start of line in multi-line mode.
3134 // 4. A single literal character.
3135 // 5. A character from a set of characters.
3137 if (fRXPat
->fStartType
== START_START
) {
3138 // Match only at the start of an input text string.
3139 // start type is already set. We're done.
3140 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
3141 // Match beginning only with a literal string.
3142 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
3143 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
3144 fRXPat
->fStartType
= START_STRING
;
3145 fRXPat
->fInitialChar
= c
;
3146 } else if (fRXPat
->fStartType
== START_LINE
) {
3147 // Match at start of line in Multi-Line mode.
3148 // Nothing to do here; everything is already set.
3149 } else if (fRXPat
->fMinMatchLen
== 0) {
3150 // Zero length match possible. We could start anywhere.
3151 fRXPat
->fStartType
= START_NO_INFO
;
3152 } else if (fRXPat
->fInitialChars
->size() == 1) {
3153 // All matches begin with the same char.
3154 fRXPat
->fStartType
= START_CHAR
;
3155 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
3156 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
3157 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
3158 fRXPat
->fMinMatchLen
> 0) {
3159 // Matches start with a set of character smaller than the set of all chars.
3160 fRXPat
->fStartType
= START_SET
;
3162 // Matches can start with anything
3163 fRXPat
->fStartType
= START_NO_INFO
;
3171 //------------------------------------------------------------------------------
3173 // minMatchLength Calculate the length of the shortest string that could
3174 // match the specified pattern.
3175 // Length is in 16 bit code units, not code points.
3177 // The calculated length may not be exact. The returned
3178 // value may be shorter than the actual minimum; it must
3181 // start and end are the range of p-code operations to be
3182 // examined. The endpoints are included in the range.
3184 //------------------------------------------------------------------------------
3185 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
3186 if (U_FAILURE(*fStatus
)) {
3190 U_ASSERT(start
<= end
);
3191 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3197 int32_t currentLen
= 0;
3200 // forwardedLength is a vector holding minimum-match-length values that
3201 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3202 // It must be one longer than the pattern being checked because some ops
3203 // will jmp to a end-of-block+1 location from within a block, and we must
3204 // count those when checking the block.
3205 UVector32
forwardedLength(end
+2, *fStatus
);
3206 forwardedLength
.setSize(end
+2);
3207 for (loc
=start
; loc
<=end
+1; loc
++) {
3208 forwardedLength
.setElementAt(INT32_MAX
, loc
);
3211 for (loc
= start
; loc
<=end
; loc
++) {
3212 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3213 opType
= URX_TYPE(op
);
3215 // The loop is advancing linearly through the pattern.
3216 // If the op we are now at was the destination of a branch in the pattern,
3217 // and that path has a shorter minimum length than the current accumulated value,
3218 // replace the current accumulated value.
3219 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3220 // no-match-possible cases.
3221 if (forwardedLength
.elementAti(loc
) < currentLen
) {
3222 currentLen
= forwardedLength
.elementAti(loc
);
3223 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3227 // Ops that don't change the total length matched
3228 case URX_RESERVED_OP
:
3230 case URX_STRING_LEN
:
3232 case URX_START_CAPTURE
:
3233 case URX_END_CAPTURE
:
3234 case URX_BACKSLASH_B
:
3235 case URX_BACKSLASH_BU
:
3236 case URX_BACKSLASH_G
:
3237 case URX_BACKSLASH_Z
:
3243 case URX_RELOC_OPRND
:
3244 case URX_STO_INP_LOC
:
3246 case URX_CARET_M_UNIX
:
3247 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3250 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3258 // Ops that match a minimum of one character (one or two 16 bit code units.)
3261 case URX_STATIC_SETREF
:
3262 case URX_STAT_SETREF_N
:
3264 case URX_BACKSLASH_D
:
3265 case URX_BACKSLASH_H
:
3266 case URX_BACKSLASH_R
:
3267 case URX_BACKSLASH_V
:
3269 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3270 case URX_DOTANY_ALL
: // . matches one or two.
3272 case URX_DOTANY_UNIX
:
3273 currentLen
= safeIncrement(currentLen
, 1);
3278 loc
++; // URX_JMPX has an extra operand, ignored here,
3279 // otherwise processed identically to URX_JMP.
3283 int32_t jmpDest
= URX_VAL(op
);
3284 if (jmpDest
< loc
) {
3285 // Loop of some kind. Can safely ignore, the worst that will happen
3286 // is that we understate the true minimum length
3287 currentLen
= forwardedLength
.elementAti(loc
+1);
3289 // Forward jump. Propagate the current min length to the target loc of the jump.
3290 U_ASSERT(jmpDest
<= end
+1);
3291 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
3292 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3300 // Back-tracks are kind of like a branch, except that the min length was
3301 // propagated already, by the state save.
3302 currentLen
= forwardedLength
.elementAti(loc
+1);
3307 case URX_STATE_SAVE
:
3309 // State Save, for forward jumps, propagate the current minimum.
3310 // of the state save.
3311 int32_t jmpDest
= URX_VAL(op
);
3312 if (jmpDest
> loc
) {
3313 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3314 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3324 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3325 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3333 // TODO: with full case folding, matching input text may be shorter than
3334 // the string we have here. More smarts could put some bounds on it.
3335 // Assume a min length of one for now. A min length of zero causes
3336 // optimization failures for a pattern like "string"+
3337 // currentLen += URX_VAL(stringLenOp);
3338 currentLen
= safeIncrement(currentLen
, 1);
3343 case URX_CTR_INIT_NG
:
3346 // If the min loop count == 0
3347 // move loc forwards to the end of the loop, skipping over the body.
3348 // If the min count is > 0,
3349 // continue normal processing of the body of the loop.
3350 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3351 loopEndLoc
= URX_VAL(loopEndLoc
);
3352 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3353 if (minLoopCount
== 0) {
3356 loc
+=3; // Skips over operands of CTR_INIT
3363 case URX_CTR_LOOP_NG
:
3365 // The jump is conditional, backwards only.
3369 case URX_LOOP_DOT_I
:
3371 // More loop ops. These state-save to themselves.
3372 // don't change the minimum match - could match nothing at all.
3379 // Look-around. Scan forward until the matching look-ahead end,
3380 // without processing the look-around block. This is overly pessimistic for look-ahead,
3381 // it assumes that the look-ahead match might be zero-length.
3382 // TODO: Positive lookahead could recursively do the block, then continue
3383 // with the longer of the block or the value coming in. Ticket 6060
3384 int32_t depth
= (opType
== URX_LA_START
? 2: 1);;
3387 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3388 if (URX_TYPE(op
) == URX_LA_START
) {
3389 // The boilerplate for look-ahead includes two LA_END insturctions,
3390 // Depth will be decremented by each one when it is seen.
3393 if (URX_TYPE(op
) == URX_LB_START
) {
3396 if (URX_TYPE(op
) == URX_LA_END
) {
3402 if (URX_TYPE(op
)==URX_LBN_END
) {
3408 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3409 // Need this because neg lookahead blocks will FAIL to outside
3411 int32_t jmpDest
= URX_VAL(op
);
3412 if (jmpDest
> loc
) {
3413 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3414 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3418 U_ASSERT(loc
<= end
);
3428 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3429 // range being sized, which happens when measuring size of look-behind blocks.
3438 // We have finished walking through the ops. Check whether some forward jump
3439 // propagated a shorter length to location end+1.
3440 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3441 currentLen
= forwardedLength
.elementAti(end
+1);
3442 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3448 //------------------------------------------------------------------------------
3450 // maxMatchLength Calculate the length of the longest string that could
3451 // match the specified pattern.
3452 // Length is in 16 bit code units, not code points.
3454 // The calculated length may not be exact. The returned
3455 // value may be longer than the actual maximum; it must
3456 // never be shorter.
3458 //------------------------------------------------------------------------------
3459 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
3460 if (U_FAILURE(*fStatus
)) {
3463 U_ASSERT(start
<= end
);
3464 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3470 int32_t currentLen
= 0;
3471 UVector32
forwardedLength(end
+1, *fStatus
);
3472 forwardedLength
.setSize(end
+1);
3474 for (loc
=start
; loc
<=end
; loc
++) {
3475 forwardedLength
.setElementAt(0, loc
);
3478 for (loc
= start
; loc
<=end
; loc
++) {
3479 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3480 opType
= URX_TYPE(op
);
3482 // The loop is advancing linearly through the pattern.
3483 // If the op we are now at was the destination of a branch in the pattern,
3484 // and that path has a longer maximum length than the current accumulated value,
3485 // replace the current accumulated value.
3486 if (forwardedLength
.elementAti(loc
) > currentLen
) {
3487 currentLen
= forwardedLength
.elementAti(loc
);
3491 // Ops that don't change the total length matched
3492 case URX_RESERVED_OP
:
3494 case URX_STRING_LEN
:
3496 case URX_START_CAPTURE
:
3497 case URX_END_CAPTURE
:
3498 case URX_BACKSLASH_B
:
3499 case URX_BACKSLASH_BU
:
3500 case URX_BACKSLASH_G
:
3501 case URX_BACKSLASH_Z
:
3507 case URX_RELOC_OPRND
:
3508 case URX_STO_INP_LOC
:
3510 case URX_CARET_M_UNIX
:
3512 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3522 // Ops that increase that cause an unbounded increase in the length
3523 // of a matched string, or that increase it a hard to characterize way.
3524 // Call the max length unbounded, and stop further checking.
3525 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3527 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3528 currentLen
= INT32_MAX
;
3532 // Ops that match a max of one character (possibly two 16 bit code units.)
3534 case URX_STATIC_SETREF
:
3535 case URX_STAT_SETREF_N
:
3537 case URX_BACKSLASH_D
:
3538 case URX_BACKSLASH_H
:
3539 case URX_BACKSLASH_R
:
3540 case URX_BACKSLASH_V
:
3542 case URX_DOTANY_ALL
:
3544 case URX_DOTANY_UNIX
:
3545 currentLen
= safeIncrement(currentLen
, 2);
3548 // Single literal character. Increase current max length by one or two,
3549 // depending on whether the char is in the supplementary range.
3551 currentLen
= safeIncrement(currentLen
, 1);
3552 if (URX_VAL(op
) > 0x10000) {
3553 currentLen
= safeIncrement(currentLen
, 1);
3564 int32_t jmpDest
= URX_VAL(op
);
3565 if (jmpDest
< loc
) {
3566 // Loop of some kind. Max match length is unbounded.
3567 currentLen
= INT32_MAX
;
3569 // Forward jump. Propagate the current min length to the target loc of the jump.
3570 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
3571 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3579 // back-tracks are kind of like a branch, except that the max length was
3580 // propagated already, by the state save.
3581 currentLen
= forwardedLength
.elementAti(loc
+1);
3585 case URX_STATE_SAVE
:
3587 // State Save, for forward jumps, propagate the current minimum.
3588 // of the state save.
3589 // For backwards jumps, they create a loop, maximum
3590 // match length is unbounded.
3591 int32_t jmpDest
= URX_VAL(op
);
3592 if (jmpDest
> loc
) {
3593 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
3594 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3597 currentLen
= INT32_MAX
;
3608 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3609 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3614 // TODO: This code assumes that any user string that matches will be no longer
3615 // than our compiled string, with case insensitive matching.
3616 // Our compiled string has been case-folded already.
3618 // Any matching user string will have no more code points than our
3619 // compiled (folded) string. Folding may add code points, but
3622 // There is a potential problem if a supplemental code point
3623 // case-folds to a BMP code point. In this case our compiled string
3624 // could be shorter (in code units) than a matching user string.
3626 // At this time (Unicode 6.1) there are no such characters, and this case
3627 // is not being handled. A test, intltest regex/Bug9283, will fail if
3628 // any problematic characters are added to Unicode.
3630 // If this happens, we can make a set of the BMP chars that the
3631 // troublesome supplementals fold to, scan our string, and bump the
3632 // currentLen one extra for each that is found.
3636 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3637 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3642 case URX_CTR_INIT_NG
:
3643 // For Loops, recursively call this function on the pattern for the loop body,
3644 // then multiply the result by the maximum loop count.
3646 int32_t loopEndLoc
= URX_VAL(fRXPat
->fCompiledPat
->elementAti(loc
+1));
3647 if (loopEndLoc
== loc
+4) {
3648 // Loop has an empty body. No affect on max match length.
3649 // Continue processing with code after the loop end.
3654 int32_t maxLoopCount
= static_cast<int32_t>(fRXPat
->fCompiledPat
->elementAti(loc
+3));
3655 if (maxLoopCount
== -1) {
3656 // Unbounded Loop. No upper bound on match length.
3657 currentLen
= INT32_MAX
;
3661 U_ASSERT(loopEndLoc
>= loc
+4);
3662 int64_t blockLen
= maxMatchLength(loc
+4, loopEndLoc
-1); // Recursive call.
3663 int64_t updatedLen
= (int64_t)currentLen
+ blockLen
* maxLoopCount
;
3664 if (updatedLen
>= INT32_MAX
) {
3665 currentLen
= INT32_MAX
;
3668 currentLen
= (int32_t)updatedLen
;
3674 case URX_CTR_LOOP_NG
:
3675 // These opcodes will be skipped over by code for URX_CRT_INIT.
3676 // We shouldn't encounter them here.
3680 case URX_LOOP_DOT_I
:
3682 // For anything to do with loops, make the match length unbounded.
3683 currentLen
= INT32_MAX
;
3690 // Look-ahead. Just ignore, treat the look-ahead block as if
3691 // it were normal pattern. Gives a too-long match length,
3692 // but good enough for now.
3695 // End of look-ahead ops should always be consumed by the processing at
3696 // the URX_LA_START op.
3697 // UPRV_UNREACHABLE;
3701 // Look-behind. Scan forward until the matching look-around end,
3702 // without processing the look-behind block.
3706 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3707 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
3710 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3716 U_ASSERT(loc
< end
);
3726 if (currentLen
== INT32_MAX
) {
3727 // The maximum length is unbounded.
3728 // Stop further processing of the pattern.
3738 //------------------------------------------------------------------------------
3740 // stripNOPs Remove any NOP operations from the compiled pattern code.
3741 // Extra NOPs are inserted for some constructs during the initial
3742 // code generation to provide locations that may be patched later.
3743 // Many end up unneeded, and are removed by this function.
3745 // In order to minimize the number of passes through the pattern,
3746 // back-reference fixup is also performed here (adjusting
3747 // back-reference operands to point to the correct frame offsets).
3749 //------------------------------------------------------------------------------
3750 void RegexCompile::stripNOPs() {
3752 if (U_FAILURE(*fStatus
)) {
3756 int32_t end
= fRXPat
->fCompiledPat
->size();
3757 UVector32
deltas(end
, *fStatus
);
3759 // Make a first pass over the code, computing the amount that things
3760 // will be offset at each location in the original code.
3763 for (loc
=0; loc
<end
; loc
++) {
3764 deltas
.addElement(d
, *fStatus
);
3765 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3766 if (URX_TYPE(op
) == URX_NOP
) {
3771 UnicodeString caseStringBuffer
;
3773 // Make a second pass over the code, removing the NOPs by moving following
3774 // code up, and patching operands that refer to code locations that
3775 // are being moved. The array of offsets from the first step is used
3776 // to compute the new operand values.
3779 for (src
=0; src
<end
; src
++) {
3780 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(src
);
3781 int32_t opType
= URX_TYPE(op
);
3786 case URX_STATE_SAVE
:
3789 case URX_CTR_LOOP_NG
:
3790 case URX_RELOC_OPRND
:
3794 // These are instructions with operands that refer to code locations.
3796 int32_t operandAddress
= URX_VAL(op
);
3797 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3798 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3799 op
= buildOp(opType
, fixedOperandAddress
);
3800 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3808 int32_t where
= URX_VAL(op
);
3809 if (where
> fRXPat
->fGroupMap
->size()) {
3810 error(U_REGEX_INVALID_BACK_REF
);
3813 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
3814 op
= buildOp(opType
, where
);
3815 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3818 fRXPat
->fNeedsAltInput
= TRUE
;
3821 case URX_RESERVED_OP
:
3822 case URX_RESERVED_OP_N
:
3827 case URX_STRING_LEN
:
3828 case URX_START_CAPTURE
:
3829 case URX_END_CAPTURE
:
3830 case URX_STATIC_SETREF
:
3831 case URX_STAT_SETREF_N
:
3835 case URX_BACKSLASH_B
:
3836 case URX_BACKSLASH_BU
:
3837 case URX_BACKSLASH_G
:
3838 case URX_BACKSLASH_X
:
3839 case URX_BACKSLASH_Z
:
3840 case URX_DOTANY_ALL
:
3841 case URX_BACKSLASH_D
:
3845 case URX_CTR_INIT_NG
:
3846 case URX_DOTANY_UNIX
:
3849 case URX_STO_INP_LOC
:
3856 case URX_CARET_M_UNIX
:
3863 case URX_LOOP_DOT_I
:
3867 case URX_BACKSLASH_H
:
3868 case URX_BACKSLASH_R
:
3869 case URX_BACKSLASH_V
:
3870 // These instructions are unaltered by the relocation.
3871 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3876 // Some op is unaccounted for.
3881 fRXPat
->fCompiledPat
->setSize(dst
);
3887 //------------------------------------------------------------------------------
3889 // Error Report a rule parse error.
3890 // Only report it if no previous error has been recorded.
3892 //------------------------------------------------------------------------------
3893 void RegexCompile::error(UErrorCode e
) {
3894 if (U_SUCCESS(*fStatus
) || e
== U_MEMORY_ALLOCATION_ERROR
) {
3896 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3897 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3898 // int64_t. If the values of the latter are out of range for the former,
3899 // set them to the appropriate "field not supported" values.
3900 if (fLineNum
> 0x7FFFFFFF) {
3901 fParseErr
->line
= 0;
3902 fParseErr
->offset
= -1;
3903 } else if (fCharNum
> 0x7FFFFFFF) {
3904 fParseErr
->line
= (int32_t)fLineNum
;
3905 fParseErr
->offset
= -1;
3907 fParseErr
->line
= (int32_t)fLineNum
;
3908 fParseErr
->offset
= (int32_t)fCharNum
;
3911 UErrorCode status
= U_ZERO_ERROR
; // throwaway status for extracting context
3913 // Fill in the context.
3914 // Note: extractBetween() pins supplied indicies to the string bounds.
3915 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3916 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3917 utext_extract(fRXPat
->fPattern
, fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
, fParseErr
->preContext
, U_PARSE_CONTEXT_LEN
, &status
);
3918 utext_extract(fRXPat
->fPattern
, fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1, fParseErr
->postContext
, U_PARSE_CONTEXT_LEN
, &status
);
3924 // Assorted Unicode character constants.
3925 // Numeric because there is no portable way to enter them as literals.
3928 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3929 static const UChar chLF
= 0x0a; // Line Feed
3930 static const UChar chPound
= 0x23; // '#', introduces a comment.
3931 static const UChar chDigit0
= 0x30; // '0'
3932 static const UChar chDigit7
= 0x37; // '9'
3933 static const UChar chColon
= 0x3A; // ':'
3934 static const UChar chE
= 0x45; // 'E'
3935 static const UChar chQ
= 0x51; // 'Q'
3936 //static const UChar chN = 0x4E; // 'N'
3937 static const UChar chP
= 0x50; // 'P'
3938 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3939 //static const UChar chLBracket = 0x5b; // '['
3940 static const UChar chRBracket
= 0x5d; // ']'
3941 static const UChar chUp
= 0x5e; // '^'
3942 static const UChar chLowerP
= 0x70;
3943 static const UChar chLBrace
= 0x7b; // '{'
3944 static const UChar chRBrace
= 0x7d; // '}'
3945 static const UChar chNEL
= 0x85; // NEL newline variant
3946 static const UChar chLS
= 0x2028; // Unicode Line Separator
3949 //------------------------------------------------------------------------------
3951 // nextCharLL Low Level Next Char from the regex pattern.
3952 // Get a char from the string, keep track of input position
3953 // for error reporting.
3955 //------------------------------------------------------------------------------
3956 UChar32
RegexCompile::nextCharLL() {
3959 if (fPeekChar
!= -1) {
3965 // assume we're already in the right place
3966 ch
= UTEXT_NEXT32(fRXPat
->fPattern
);
3967 if (ch
== U_SENTINEL
) {
3974 (ch
== chLF
&& fLastChar
!= chCR
)) {
3975 // Character is starting a new line. Bump up the line number, and
3976 // reset the column to 0.
3981 // Character is not starting a new line. Except in the case of a
3982 // LF following a CR, increment the column position.
3991 //------------------------------------------------------------------------------
3993 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3994 // character without actually getting it.
3996 //------------------------------------------------------------------------------
3997 UChar32
RegexCompile::peekCharLL() {
3998 if (fPeekChar
== -1) {
3999 fPeekChar
= nextCharLL();
4005 //------------------------------------------------------------------------------
4007 // nextChar for pattern scanning. At this level, we handle stripping
4008 // out comments and processing some backslash character escapes.
4009 // The rest of the pattern grammar is handled at the next level up.
4011 //------------------------------------------------------------------------------
4012 void RegexCompile::nextChar(RegexPatternChar
&c
) {
4014 fScanIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4015 c
.fChar
= nextCharLL();
4020 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
&& ((fModeFlags
& UREGEX_LITERAL
) == 0)) ||
4021 c
.fChar
== (UChar32
)-1) {
4022 fQuoteMode
= FALSE
; // Exit quote mode,
4023 nextCharLL(); // discard the E
4024 // nextChar(c); // recurse to get the real next char
4025 goto tailRecursion
; // Note: fuzz testing produced testcases that
4026 // resulted in stack overflow here.
4029 else if (fInBackslashQuote
) {
4030 // The current character immediately follows a '\'
4031 // Don't check for any further escapes, just return it as-is.
4032 // Don't set c.fQuoted, because that would prevent the state machine from
4033 // dispatching on the character.
4034 fInBackslashQuote
= FALSE
;
4038 // We are not in a \Q quoted region \E of the source.
4040 if (fModeFlags
& UREGEX_COMMENTS
) {
4042 // We are in free-spacing and comments mode.
4043 // Scan through any white space and comments, until we
4044 // reach a significant character or the end of inut.
4046 if (c
.fChar
== (UChar32
)-1) {
4047 break; // End of Input
4049 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
4050 // Start of a comment. Consume the rest of it, until EOF or a new line
4052 c
.fChar
= nextCharLL();
4053 if (c
.fChar
== (UChar32
)-1 || // EOF
4062 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4063 if (PatternProps::isWhiteSpace(c
.fChar
) == FALSE
) {
4066 c
.fChar
= nextCharLL();
4071 // check for backslash escaped characters.
4073 if (c
.fChar
== chBackSlash
) {
4074 int64_t pos
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4075 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
.contains(peekCharLL())) {
4077 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4078 // Includes \uxxxx, \n, \r, many others.
4079 // Return the single equivalent character.
4081 nextCharLL(); // get & discard the peeked char.
4084 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat
->fPattern
, fPatternLength
)) {
4085 int32_t endIndex
= (int32_t)pos
;
4086 c
.fChar
= u_unescapeAt(uregex_ucstr_unescape_charAt
, &endIndex
, (int32_t)fPatternLength
, (void *)fRXPat
->fPattern
->chunkContents
);
4088 if (endIndex
== pos
) {
4089 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4091 fCharNum
+= endIndex
- pos
;
4092 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, endIndex
);
4095 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat
->fPattern
);
4097 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, pos
);
4098 c
.fChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
4101 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4102 } else if (context
.lastOffset
== offset
) {
4103 UTEXT_PREVIOUS32(fRXPat
->fPattern
);
4104 } else if (context
.lastOffset
!= offset
-1) {
4105 utext_moveIndex32(fRXPat
->fPattern
, offset
- context
.lastOffset
- 1);
4110 else if (peekCharLL() == chDigit0
) {
4111 // Octal Escape, using Java Regexp Conventions
4112 // which are \0 followed by 1-3 octal digits.
4113 // Different from ICU Unescape handling of Octal, which does not
4114 // require the leading 0.
4115 // Java also has the convention of only consuming 2 octal digits if
4116 // the three digit number would be > 0xff
4119 nextCharLL(); // Consume the initial 0.
4121 for (index
=0; index
<3; index
++) {
4122 int32_t ch
= peekCharLL();
4123 if (ch
<chDigit0
|| ch
>chDigit7
) {
4125 // \0 is not followed by any octal digits.
4126 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4132 if (c
.fChar
<= 255) {
4135 // The last digit made the number too big. Forget we saw it.
4141 else if (peekCharLL() == chQ
) {
4142 // "\Q" enter quote mode, which will continue until "\E"
4144 nextCharLL(); // discard the 'Q'.
4145 // nextChar(c); // recurse to get the real next char.
4146 goto tailRecursion
; // Note: fuzz testing produced test cases that
4147 // resulted in stack overflow here.
4151 // We are in a '\' escape that will be handled by the state table scanner.
4152 // Just return the backslash, but remember that the following char is to
4153 // be taken literally.
4154 fInBackslashQuote
= TRUE
;
4159 // re-enable # to end-of-line comments, in case they were disabled.
4160 // They are disabled by the parser upon seeing '(?', but this lasts for
4161 // the fetching of the next character only.
4162 fEOLComments
= TRUE
;
4164 // putc(c.fChar, stdout);
4169 //------------------------------------------------------------------------------
4172 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4174 // The scan position will be at the 'N'. On return
4175 // the scan position should be just after the '}'
4177 // Return the UChar32
4179 //------------------------------------------------------------------------------
4180 UChar32
RegexCompile::scanNamedChar() {
4181 if (U_FAILURE(*fStatus
)) {
4186 if (fC
.fChar
!= chLBrace
) {
4187 error(U_REGEX_PROPERTY_SYNTAX
);
4191 UnicodeString charName
;
4194 if (fC
.fChar
== chRBrace
) {
4197 if (fC
.fChar
== -1) {
4198 error(U_REGEX_PROPERTY_SYNTAX
);
4201 charName
.append(fC
.fChar
);
4205 if (!uprv_isInvariantUString(charName
.getBuffer(), charName
.length()) ||
4206 (uint32_t)charName
.length()>=sizeof(name
)) {
4207 // All Unicode character names have only invariant characters.
4208 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4209 // which requires this error check
4210 error(U_REGEX_PROPERTY_SYNTAX
);
4213 charName
.extract(0, charName
.length(), name
, sizeof(name
), US_INV
);
4215 UChar32 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, name
, fStatus
);
4216 if (U_FAILURE(*fStatus
)) {
4217 error(U_REGEX_PROPERTY_SYNTAX
);
4220 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
4224 //------------------------------------------------------------------------------
4226 // scanProp Construct a UnicodeSet from the text at the current scan
4227 // position, which will be of the form \p{whaterver}
4229 // The scan position will be at the 'p' or 'P'. On return
4230 // the scan position should be just after the '}'
4232 // Return a UnicodeSet, constructed from the \P pattern,
4233 // or NULL if the pattern is invalid.
4235 //------------------------------------------------------------------------------
4236 UnicodeSet
*RegexCompile::scanProp() {
4237 UnicodeSet
*uset
= NULL
;
4239 if (U_FAILURE(*fStatus
)) {
4242 (void)chLowerP
; // Suppress compiler unused variable warning.
4243 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chP
);
4244 UBool negated
= (fC
.fChar
== chP
);
4246 UnicodeString propertyName
;
4248 if (fC
.fChar
!= chLBrace
) {
4249 error(U_REGEX_PROPERTY_SYNTAX
);
4254 if (fC
.fChar
== chRBrace
) {
4257 if (fC
.fChar
== -1) {
4258 // Hit the end of the input string without finding the closing '}'
4259 error(U_REGEX_PROPERTY_SYNTAX
);
4262 propertyName
.append(fC
.fChar
);
4264 uset
= createSetForProperty(propertyName
, negated
);
4265 nextChar(fC
); // Move input scan to position following the closing '}'
4269 //------------------------------------------------------------------------------
4271 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4272 // position, which is expected be of the form [:property expression:]
4274 // The scan position will be at the opening ':'. On return
4275 // the scan position must be on the closing ']'
4277 // Return a UnicodeSet constructed from the pattern,
4278 // or NULL if this is not a valid POSIX-style set expression.
4279 // If not a property expression, restore the initial scan position
4280 // (to the opening ':')
4282 // Note: the opening '[:' is not sufficient to guarantee that
4283 // this is a [:property:] expression.
4284 // [:'+=,] is a perfectly good ordinary set expression that
4285 // happens to include ':' as one of its characters.
4287 //------------------------------------------------------------------------------
4288 UnicodeSet
*RegexCompile::scanPosixProp() {
4289 UnicodeSet
*uset
= NULL
;
4291 if (U_FAILURE(*fStatus
)) {
4295 U_ASSERT(fC
.fChar
== chColon
);
4297 // Save the scanner state.
4298 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4299 int64_t savedScanIndex
= fScanIndex
;
4300 int64_t savedNextIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4301 UBool savedQuoteMode
= fQuoteMode
;
4302 UBool savedInBackslashQuote
= fInBackslashQuote
;
4303 UBool savedEOLComments
= fEOLComments
;
4304 int64_t savedLineNum
= fLineNum
;
4305 int64_t savedCharNum
= fCharNum
;
4306 UChar32 savedLastChar
= fLastChar
;
4307 UChar32 savedPeekChar
= fPeekChar
;
4308 RegexPatternChar savedfC
= fC
;
4310 // Scan for a closing ]. A little tricky because there are some perverse
4311 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4312 // ending on the second closing ].
4314 UnicodeString propName
;
4315 UBool negated
= FALSE
;
4317 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4319 if (fC
.fChar
== chUp
) {
4324 // Scan for the closing ":]", collecting the property name along the way.
4325 UBool sawPropSetTerminator
= FALSE
;
4327 propName
.append(fC
.fChar
);
4329 if (fC
.fQuoted
|| fC
.fChar
== -1) {
4330 // Escaped characters or end of input - either says this isn't a [:Property:]
4333 if (fC
.fChar
== chColon
) {
4335 if (fC
.fChar
== chRBracket
) {
4336 sawPropSetTerminator
= TRUE
;
4342 if (sawPropSetTerminator
) {
4343 uset
= createSetForProperty(propName
, negated
);
4348 // Restore the original scan position.
4349 // The main scanner will retry the input as a normal set expression,
4350 // not a [:Property:] expression.
4351 fScanIndex
= savedScanIndex
;
4352 fQuoteMode
= savedQuoteMode
;
4353 fInBackslashQuote
= savedInBackslashQuote
;
4354 fEOLComments
= savedEOLComments
;
4355 fLineNum
= savedLineNum
;
4356 fCharNum
= savedCharNum
;
4357 fLastChar
= savedLastChar
;
4358 fPeekChar
= savedPeekChar
;
4360 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, savedNextIndex
);
4365 static inline void addIdentifierIgnorable(UnicodeSet
*set
, UErrorCode
& ec
) {
4366 set
->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4367 addCategory(set
, U_GC_CF_MASK
, ec
);
4371 // Create a Unicode Set from a Unicode Property expression.
4372 // This is common code underlying both \p{...} ane [:...:] expressions.
4373 // Includes trying the Java "properties" that aren't supported as
4374 // normal ICU UnicodeSet properties
4376 UnicodeSet
*RegexCompile::createSetForProperty(const UnicodeString
&propName
, UBool negated
) {
4378 if (U_FAILURE(*fStatus
)) {
4381 LocalPointer
<UnicodeSet
> set
;
4382 UErrorCode status
= U_ZERO_ERROR
;
4384 do { // non-loop, exists to allow breaks from the block.
4386 // First try the property as we received it
4388 UnicodeString setExpr
;
4389 uint32_t usetFlags
= 0;
4390 setExpr
.append(u
"[\\p{", -1);
4391 setExpr
.append(propName
);
4392 setExpr
.append(u
"}]", -1);
4393 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
4394 usetFlags
|= USET_CASE_INSENSITIVE
;
4396 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr
, usetFlags
, NULL
, status
), status
);
4397 if (U_SUCCESS(status
) || status
== U_MEMORY_ALLOCATION_ERROR
) {
4402 // The incoming property wasn't directly recognized by ICU.
4404 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4405 // Java accepts 'word' with mixed case.
4406 // Java accepts 'all' only in all lower case.
4408 status
= U_ZERO_ERROR
;
4409 if (propName
.caseCompare(u
"word", -1, 0) == 0) {
4410 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat
->fStaticSets
[URX_ISWORD_SET
])), status
);
4413 if (propName
.compare(u
"all", -1) == 0) {
4414 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status
);
4419 // Do Java InBlock expressions
4421 UnicodeString mPropName
= propName
;
4422 if (mPropName
.startsWith(u
"In", 2) && mPropName
.length() >= 3) {
4423 status
= U_ZERO_ERROR
;
4424 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status
);
4425 if (U_FAILURE(status
)) {
4428 UnicodeString
blockName(mPropName
, 2); // Property with the leading "In" removed.
4429 set
->applyPropertyAlias(UnicodeString(u
"Block"), blockName
, status
);
4433 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4434 // as "BooleanPropertyValue". The property value can be either a
4435 // a General Category or a Script Name.
4437 if (propName
.startsWith(u
"Is", 2) && propName
.length()>=3) {
4438 mPropName
.remove(0, 2); // Strip the "Is"
4439 if (mPropName
.indexOf(u
'=') >= 0) {
4440 // Reject any "Is..." property expression containing an '=', that is,
4441 // any non-binary property expression.
4442 status
= U_REGEX_PROPERTY_SYNTAX
;
4446 if (mPropName
.caseCompare(u
"assigned", -1, 0) == 0) {
4447 mPropName
.setTo(u
"unassigned", -1);
4449 } else if (mPropName
.caseCompare(u
"TitleCase", -1, 0) == 0) {
4450 mPropName
.setTo(u
"Titlecase_Letter", -1);
4453 mPropName
.insert(0, u
"[\\p{", -1);
4454 mPropName
.append(u
"}]", -1);
4455 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName
, *fStatus
), status
);
4457 if (U_SUCCESS(status
) && !set
->isEmpty() && (usetFlags
& USET_CASE_INSENSITIVE
)) {
4458 set
->closeOver(USET_CASE_INSENSITIVE
);
4464 if (propName
.startsWith(u
"java", -1)) {
4465 status
= U_ZERO_ERROR
;
4466 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status
);
4467 if (U_FAILURE(status
)) {
4471 // Try the various Java specific properties.
4472 // These all begin with "java"
4474 if (propName
.compare(u
"javaDefined", -1) == 0) {
4475 addCategory(set
.getAlias(), U_GC_CN_MASK
, status
);
4478 else if (propName
.compare(u
"javaDigit", -1) == 0) {
4479 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4481 else if (propName
.compare(u
"javaIdentifierIgnorable", -1) == 0) {
4482 addIdentifierIgnorable(set
.getAlias(), status
);
4484 else if (propName
.compare(u
"javaISOControl", -1) == 0) {
4485 set
->add(0, 0x1F).add(0x7F, 0x9F);
4487 else if (propName
.compare(u
"javaJavaIdentifierPart", -1) == 0) {
4488 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4489 addCategory(set
.getAlias(), U_GC_SC_MASK
, status
);
4490 addCategory(set
.getAlias(), U_GC_PC_MASK
, status
);
4491 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4492 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4493 addCategory(set
.getAlias(), U_GC_MC_MASK
, status
);
4494 addCategory(set
.getAlias(), U_GC_MN_MASK
, status
);
4495 addIdentifierIgnorable(set
.getAlias(), status
);
4497 else if (propName
.compare(u
"javaJavaIdentifierStart", -1) == 0) {
4498 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4499 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4500 addCategory(set
.getAlias(), U_GC_SC_MASK
, status
);
4501 addCategory(set
.getAlias(), U_GC_PC_MASK
, status
);
4503 else if (propName
.compare(u
"javaLetter", -1) == 0) {
4504 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4506 else if (propName
.compare(u
"javaLetterOrDigit", -1) == 0) {
4507 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4508 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4510 else if (propName
.compare(u
"javaLowerCase", -1) == 0) {
4511 addCategory(set
.getAlias(), U_GC_LL_MASK
, status
);
4513 else if (propName
.compare(u
"javaMirrored", -1) == 0) {
4514 set
->applyIntPropertyValue(UCHAR_BIDI_MIRRORED
, 1, status
);
4516 else if (propName
.compare(u
"javaSpaceChar", -1) == 0) {
4517 addCategory(set
.getAlias(), U_GC_Z_MASK
, status
);
4519 else if (propName
.compare(u
"javaSupplementaryCodePoint", -1) == 0) {
4520 set
->add(0x10000, UnicodeSet::MAX_VALUE
);
4522 else if (propName
.compare(u
"javaTitleCase", -1) == 0) {
4523 addCategory(set
.getAlias(), U_GC_LT_MASK
, status
);
4525 else if (propName
.compare(u
"javaUnicodeIdentifierStart", -1) == 0) {
4526 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4527 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4529 else if (propName
.compare(u
"javaUnicodeIdentifierPart", -1) == 0) {
4530 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4531 addCategory(set
.getAlias(), U_GC_PC_MASK
, status
);
4532 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4533 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4534 addCategory(set
.getAlias(), U_GC_MC_MASK
, status
);
4535 addCategory(set
.getAlias(), U_GC_MN_MASK
, status
);
4536 addIdentifierIgnorable(set
.getAlias(), status
);
4538 else if (propName
.compare(u
"javaUpperCase", -1) == 0) {
4539 addCategory(set
.getAlias(), U_GC_LU_MASK
, status
);
4541 else if (propName
.compare(u
"javaValidCodePoint", -1) == 0) {
4542 set
->add(0, UnicodeSet::MAX_VALUE
);
4544 else if (propName
.compare(u
"javaWhitespace", -1) == 0) {
4545 addCategory(set
.getAlias(), U_GC_Z_MASK
, status
);
4546 set
->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4547 set
->add(9, 0x0d).add(0x1c, 0x1f);
4549 status
= U_REGEX_PROPERTY_SYNTAX
;
4552 if (U_SUCCESS(status
) && !set
->isEmpty() && (usetFlags
& USET_CASE_INSENSITIVE
)) {
4553 set
->closeOver(USET_CASE_INSENSITIVE
);
4558 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4559 // extensions matched it.
4560 status
= U_REGEX_PROPERTY_SYNTAX
;
4561 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4563 if (U_SUCCESS(status
)) {
4564 U_ASSERT(set
.isValid());
4568 return set
.orphan();
4570 if (status
== U_ILLEGAL_ARGUMENT_ERROR
) {
4571 status
= U_REGEX_PROPERTY_SYNTAX
;
4580 // SetEval Part of the evaluation of [set expressions].
4581 // Perform any pending (stacked) operations with precedence
4582 // equal or greater to that of the next operator encountered
4583 // in the expression.
4585 void RegexCompile::setEval(int32_t nextOp
) {
4586 UnicodeSet
*rightOperand
= NULL
;
4587 UnicodeSet
*leftOperand
= NULL
;
4589 U_ASSERT(fSetOpStack
.empty()==FALSE
);
4590 int32_t pendingSetOperation
= fSetOpStack
.peeki();
4591 if ((pendingSetOperation
&0xffff0000) < (nextOp
&0xffff0000)) {
4595 U_ASSERT(fSetStack
.empty() == FALSE
);
4596 rightOperand
= (UnicodeSet
*)fSetStack
.peek();
4597 switch (pendingSetOperation
) {
4599 rightOperand
->complement();
4602 // TODO: need a simple close function. Ticket 6065
4603 rightOperand
->closeOver(USET_CASE_INSENSITIVE
);
4604 rightOperand
->removeAllStrings();
4606 case setDifference1
:
4607 case setDifference2
:
4609 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4610 leftOperand
->removeAll(*rightOperand
);
4611 delete rightOperand
;
4613 case setIntersection1
:
4614 case setIntersection2
:
4616 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4617 leftOperand
->retainAll(*rightOperand
);
4618 delete rightOperand
;
4622 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4623 leftOperand
->addAll(*rightOperand
);
4624 delete rightOperand
;
4632 void RegexCompile::setPushOp(int32_t op
) {
4634 fSetOpStack
.push(op
, *fStatus
);
4635 fSetStack
.push(new UnicodeSet(), *fStatus
);
4639 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS