1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 // All Rights Reserved.
9 // This file contains the ICU regular expression compiler, which is responsible
10 // for processing a regular expression pattern into the compiled form that
11 // is used by the match finding engine.
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
39 #include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
50 //------------------------------------------------------------------------------
54 //------------------------------------------------------------------------------
55 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) :
56 fParenStack(status
), fSetStack(status
), fSetOpStack(status
)
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status
);
70 fInBackslashQuote
= FALSE
;
71 fModeFlags
= fRXPat
->fFlags
| 0x80000000;
75 fMatchCloseParen
= -1;
77 fLastSetLiteral
= U_SENTINEL
;
79 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
80 status
= rxp
->fDeferredStatus
;
84 static const UChar chAmp
= 0x26; // '&'
85 static const UChar chDash
= 0x2d; // '-'
88 //------------------------------------------------------------------------------
92 //------------------------------------------------------------------------------
93 RegexCompile::~RegexCompile() {
94 delete fCaptureName
; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
98 static inline void addCategory(UnicodeSet
*set
, int32_t value
, UErrorCode
& ec
) {
99 set
->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, value
, ec
));
102 //------------------------------------------------------------------------------
104 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
105 // The state tables are hand-written in the file regexcst.txt,
106 // and converted to the form used here by a perl
107 // script regexcst.pl
109 //------------------------------------------------------------------------------
110 void RegexCompile::compile(
111 const UnicodeString
&pat
, // Source pat to be compiled.
112 UParseError
&pp
, // Error position info
113 UErrorCode
&e
) // Error Code
115 fRXPat
->fPatternString
= new UnicodeString(pat
);
116 UText patternText
= UTEXT_INITIALIZER
;
117 utext_openConstUnicodeString(&patternText
, fRXPat
->fPatternString
, &e
);
120 compile(&patternText
, pp
, e
);
121 utext_close(&patternText
);
126 // compile, UText mode
127 // All the work is actually done here.
129 void RegexCompile::compile(
130 UText
*pat
, // Source pat to be compiled.
131 UParseError
&pp
, // Error position info
132 UErrorCode
&e
) // Error Code
137 fStack
[fStackPtr
] = 0;
139 if (U_FAILURE(*fStatus
)) {
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat
->fPattern
== NULL
|| utext_nativeLength(fRXPat
->fPattern
) == 0);
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat
->fPattern
= utext_clone(fRXPat
->fPattern
, pat
, FALSE
, TRUE
, fStatus
);
148 if (U_FAILURE(*fStatus
)) {
151 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
152 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
155 // Initialize the pattern scanning state machine
156 fPatternLength
= utext_nativeLength(pat
);
158 const RegexTableEl
*tableEl
;
160 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161 if (fModeFlags
& UREGEX_LITERAL
) {
165 nextChar(fC
); // Fetch the first char from the pattern string.
168 // Main loop for the regex pattern parsing state machine.
169 // Runs once per state transition.
170 // Each time through optionally performs, depending on the state table,
171 // - an advance to the the next pattern char
172 // - an action to be performed.
173 // - pushing or popping a state to/from the local state return stack.
174 // file regexcst.txt is the source for the state table. The logic behind
175 // recongizing the pattern syntax is there, not here.
178 // Bail out if anything has gone wrong.
179 // Regex pattern parsing stops on the first error encountered.
180 if (U_FAILURE(*fStatus
)) {
184 U_ASSERT(state
!= 0);
186 // Find the state table element that matches the input char from the pattern, or the
187 // class of the input character. Start with the first table row for this
188 // state, then linearly scan forward until we find a row that matches the
189 // character. The last row for each state always matches all characters, so
190 // the search will stop there, if not before.
192 tableEl
= &gRuleParseStateTable
[state
];
193 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
194 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]));
196 for (;;) { // loop through table rows belonging to this state, looking for one
197 // that matches the current input char.
198 REGEX_SCAN_DEBUG_PRINTF(("."));
199 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
200 // Table row specified an individual character, not a set, and
201 // the input character is not quoted, and
202 // the input character matched it.
205 if (tableEl
->fCharClass
== 255) {
206 // Table row specified default, match anything character class.
209 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
210 // Table row specified "quoted" and the char was quoted.
213 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
214 // Table row specified eof and we hit eof on the input.
218 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
219 fC
.fQuoted
== FALSE
&& // char is not escaped &&
220 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
221 U_ASSERT(tableEl
->fCharClass
<= 137);
222 if (RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128].contains(fC
.fChar
)) {
223 // Table row specified a character class, or set of characters,
224 // and the current char matches it.
229 // No match on this row, advance to the next row for this state,
232 REGEX_SCAN_DEBUG_PRINTF(("\n"));
235 // We've found the row of the state table that matches the current input
236 // character from the rules string.
237 // Perform any action specified by this row in the state table.
238 if (doParseActions(tableEl
->fAction
) == FALSE
) {
239 // Break out of the state machine loop if the
240 // the action signalled some kind of error, or
241 // the action was to exit, occurs on normal end-of-rules-input.
245 if (tableEl
->fPushState
!= 0) {
247 if (fStackPtr
>= kStackSize
) {
248 error(U_REGEX_INTERNAL_ERROR
);
249 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
252 fStack
[fStackPtr
] = tableEl
->fPushState
;
256 // NextChar. This is where characters are actually fetched from the pattern.
257 // Happens under control of the 'n' tag in the state table.
259 if (tableEl
->fNextChar
) {
263 // Get the next state from the table entry, or from the
264 // state stack if the next state was specified as "pop".
265 if (tableEl
->fNextState
!= 255) {
266 state
= tableEl
->fNextState
;
268 state
= fStack
[fStackPtr
];
271 // state stack underflow
272 // This will occur if the user pattern has mis-matched parentheses,
273 // with extra close parens.
276 error(U_REGEX_MISMATCHED_PAREN
);
282 if (U_FAILURE(*fStatus
)) {
283 // Bail out if the pattern had errors.
284 // Set stack cleanup: a successful compile would have left it empty,
285 // but errors can leave temporary sets hanging around.
286 while (!fSetStack
.empty()) {
287 delete (UnicodeSet
*)fSetStack
.pop();
293 // The pattern has now been read and processed, and the compiled code generated.
297 // The pattern's fFrameSize so far has accumulated the requirements for
298 // storage for capture parentheses, counters, etc. that are encountered
299 // in the pattern. Add space for the two variables that are always
300 // present in the saved state: the input string position (int64_t) and
301 // the position in the compiled pattern.
303 allocateStackData(RESTACKFRAME_HDRCOUNT
);
306 // Optimization pass 1: NOPs, back-references, and case-folding
311 // Get bounds for the minimum and maximum length of a string that this
312 // pattern can match. Used to avoid looking for matches in strings that
315 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
318 // Optimization pass 2: match start type
323 // Set up fast latin-1 range sets
325 int32_t numSets
= fRXPat
->fSets
->size();
326 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
327 // Null pointer check.
328 if (fRXPat
->fSets8
== NULL
) {
329 e
= *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
333 for (i
=0; i
<numSets
; i
++) {
334 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
335 fRXPat
->fSets8
[i
].init(s
);
344 //------------------------------------------------------------------------------
346 // doParseAction Do some action during regex pattern parsing.
347 // Called by the parse state machine.
349 // Generation of the match engine PCode happens here, or
350 // in functions called from the parse actions defined here.
353 //------------------------------------------------------------------------------
354 UBool
RegexCompile::doParseActions(int32_t action
)
356 UBool returnVal
= TRUE
;
358 switch ((Regex_PatternParseAction
)action
) {
361 // Start of pattern compiles to:
362 //0 SAVE 2 Fall back to position of FAIL
364 //2 FAIL Stop if we ever reach here.
365 //3 NOP Dummy, so start of pattern looks the same as
366 // the start of an ( grouping.
367 //4 NOP Resreved, will be replaced by a save if there are
368 // OR | operators at the top level
369 appendOp(URX_STATE_SAVE
, 2);
370 appendOp(URX_JMP
, 3);
371 appendOp(URX_FAIL
, 0);
373 // Standard open nonCapture paren action emits the two NOPs and
374 // sets up the paren stack frame.
375 doParseActions(doOpenNonCaptureParen
);
379 // We've scanned to the end of the pattern
380 // The end of pattern compiles to:
382 // which will stop the runtime match engine.
383 // Encountering end of pattern also behaves like a close paren,
384 // and forces fixups of the State Save at the beginning of the compiled pattern
385 // and of any OR operations at the top level.
388 if (fParenStack
.size() > 0) {
389 // Missing close paren in pattern.
390 error(U_REGEX_MISMATCHED_PAREN
);
393 // add the END operation to the compiled pattern.
394 appendOp(URX_END
, 0);
396 // Terminate the pattern compilation state machine.
403 // Scanning a '|', as in (A|B)
405 // Generate code for any pending literals preceding the '|'
408 // Insert a SAVE operation at the start of the pattern section preceding
409 // this OR at this level. This SAVE will branch the match forward
410 // to the right hand side of the OR in the event that the left hand
411 // side fails to match and backtracks. Locate the position for the
412 // save from the location on the top of the parentheses stack.
413 int32_t savePosition
= fParenStack
.popi();
414 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(savePosition
);
415 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
416 op
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
417 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
419 // Append an JMP operation into the compiled pattern. The operand for
420 // the JMP will eventually be the location following the ')' for the
421 // group. This will be patched in later, when the ')' is encountered.
422 appendOp(URX_JMP
, 0);
424 // Push the position of the newly added JMP op onto the parentheses stack.
425 // This registers if for fixup when this block's close paren is encountered.
426 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
428 // Append a NOP to the compiled pattern. This is the slot reserved
429 // for a SAVE in the event that there is yet another '|' following
431 appendOp(URX_NOP
, 0);
432 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
437 case doBeginNamedCapture
:
438 // Scanning (?<letter.
439 // The first letter of the name will come through again under doConinueNamedCapture.
440 fCaptureName
= new UnicodeString();
441 if (fCaptureName
== NULL
) {
442 error(U_MEMORY_ALLOCATION_ERROR
);
446 case doContinueNamedCapture
:
447 fCaptureName
->append(fC
.fChar
);
450 case doBadNamedCapture
:
451 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
454 case doOpenCaptureParen
:
455 // Open Capturing Paren, possibly named.
457 // - NOP, which later may be replaced by a save-state if the
458 // parenthesized group gets a * quantifier, followed by
459 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
460 // - NOP, which may later be replaced by a save-state if there
461 // is an '|' alternation within the parens.
463 // Each capture group gets three slots in the save stack frame:
464 // 0: Capture Group start position (in input string being matched.)
465 // 1: Capture Group end position.
466 // 2: Start of Match-in-progress.
467 // The first two locations are for a completed capture group, and are
468 // referred to by back references and the like.
469 // The third location stores the capture start position when an START_CAPTURE is
470 // encountered. This will be promoted to a completed capture when (and if) the corresponding
471 // END_CAPTURE is encountered.
474 appendOp(URX_NOP
, 0);
475 int32_t varsLoc
= allocateStackData(3); // Reserve three slots in match stack frame.
476 appendOp(URX_START_CAPTURE
, varsLoc
);
477 appendOp(URX_NOP
, 0);
479 // On the Parentheses stack, start a new frame and add the postions
480 // of the two NOPs. Depending on what follows in the pattern, the
481 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
482 // address of the end of the parenthesized group.
483 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
484 fParenStack
.push(capturing
, *fStatus
); // Frame type.
485 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
486 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
488 // Save the mapping from group number to stack frame variable position.
489 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
491 // If this is a named capture group, add the name->group number mapping.
492 if (fCaptureName
!= NULL
) {
493 if (!fRXPat
->initNamedCaptureMap()) {
494 if (U_SUCCESS(*fStatus
)) {
495 error(fRXPat
->fDeferredStatus
);
499 int32_t groupNumber
= fRXPat
->fGroupMap
->size();
500 int32_t previousMapping
= uhash_puti(fRXPat
->fNamedCaptureMap
, fCaptureName
, groupNumber
, fStatus
);
501 fCaptureName
= NULL
; // hash table takes ownership of the name (key) string.
502 if (previousMapping
> 0 && U_SUCCESS(*fStatus
)) {
503 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
509 case doOpenNonCaptureParen
:
510 // Open non-caputuring (grouping only) Paren.
512 // - NOP, which later may be replaced by a save-state if the
513 // parenthesized group gets a * quantifier, followed by
514 // - NOP, which may later be replaced by a save-state if there
515 // is an '|' alternation within the parens.
518 appendOp(URX_NOP
, 0);
519 appendOp(URX_NOP
, 0);
521 // On the Parentheses stack, start a new frame and add the postions
523 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
524 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
525 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
526 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
531 case doOpenAtomicParen
:
532 // Open Atomic Paren. (?>
534 // - NOP, which later may be replaced if the parenthesized group
535 // has a quantifier, followed by
536 // - STO_SP save state stack position, so it can be restored at the ")"
537 // - NOP, which may later be replaced by a save-state if there
538 // is an '|' alternation within the parens.
541 appendOp(URX_NOP
, 0);
542 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the state stack ptr.
543 appendOp(URX_STO_SP
, varLoc
);
544 appendOp(URX_NOP
, 0);
546 // On the Parentheses stack, start a new frame and add the postions
547 // of the two NOPs. Depending on what follows in the pattern, the
548 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
549 // address of the end of the parenthesized group.
550 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
551 fParenStack
.push(atomic
, *fStatus
); // Frame type.
552 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
553 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
558 case doOpenLookAhead
:
559 // Positive Look-ahead (?= stuff )
561 // Note: Addition of transparent input regions, with the need to
562 // restore the original regions when failing out of a lookahead
563 // block, complicated this sequence. Some conbined opcodes
564 // might make sense - or might not, lookahead aren't that common.
566 // Caution: min match length optimization knows about this
567 // sequence; don't change without making updates there too.
570 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
571 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
572 // 3 JMP 6 continue ...
574 // 4. LA_END Look Ahead failed. Restore regions.
575 // 5. BACKTRACK and back track again.
577 // 6. NOP reserved for use by quantifiers on the block.
578 // Look-ahead can't have quantifiers, but paren stack
579 // compile time conventions require the slot anyhow.
580 // 7. NOP may be replaced if there is are '|' ops in the block.
581 // 8. code for parenthesized stuff.
584 // Four data slots are reserved, for saving state on entry to the look-around
585 // 0: stack pointer on entry.
586 // 1: input position on entry.
587 // 2: fActiveStart, the active bounds start on entry.
588 // 3: fActiveLimit, the active bounds limit on entry.
591 int32_t dataLoc
= allocateData(4);
592 appendOp(URX_LA_START
, dataLoc
);
593 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+ 2);
594 appendOp(URX_JMP
, fRXPat
->fCompiledPat
->size()+ 3);
595 appendOp(URX_LA_END
, dataLoc
);
596 appendOp(URX_BACKTRACK
, 0);
597 appendOp(URX_NOP
, 0);
598 appendOp(URX_NOP
, 0);
600 // On the Parentheses stack, start a new frame and add the postions
602 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
603 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
604 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
605 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
609 case doOpenLookAheadNeg
:
610 // Negated Lookahead. (?! stuff )
612 // 1. LA_START dataloc
613 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
614 // // which continues with the match.
615 // 3. NOP // Std. Open Paren sequence, for possible '|'
616 // 4. code for parenthesized stuff.
617 // 5. LA_END // Cut back stack, remove saved state from step 2.
618 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
619 // 7. END_LA // Restore match region, in case look-ahead was using
620 // an alternate (transparent) region.
621 // Four data slots are reserved, for saving state on entry to the look-around
622 // 0: stack pointer on entry.
623 // 1: input position on entry.
624 // 2: fActiveStart, the active bounds start on entry.
625 // 3: fActiveLimit, the active bounds limit on entry.
628 int32_t dataLoc
= allocateData(4);
629 appendOp(URX_LA_START
, dataLoc
);
630 appendOp(URX_STATE_SAVE
, 0); // dest address will be patched later.
631 appendOp(URX_NOP
, 0);
633 // On the Parentheses stack, start a new frame and add the postions
634 // of the StateSave and NOP.
635 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
636 fParenStack
.push(negLookAhead
, *fStatus
); // Frame type
637 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
638 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
640 // Instructions #5 - #7 will be added when the ')' is encountered.
644 case doOpenLookBehind
:
646 // Compile a (?<= look-behind open paren.
649 // 0 URX_LB_START dataLoc
650 // 1 URX_LB_CONT dataLoc
653 // 4 URX_NOP Standard '(' boilerplate.
654 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
655 // 6 <code for LookBehind expression>
656 // 7 URX_LB_END dataLoc # Check match len, restore input len
657 // 8 URX_LA_END dataLoc # Restore stack, input pos
659 // Allocate a block of matcher data, to contain (when running a match)
660 // 0: Stack ptr on entry
661 // 1: Input Index on entry
662 // 2: fActiveStart, the active bounds start on entry.
663 // 3: fActiveLimit, the active bounds limit on entry.
664 // 4: Start index of match current match attempt.
665 // The first four items must match the layout of data for LA_START / LA_END
667 // Generate match code for any pending literals.
670 // Allocate data space
671 int32_t dataLoc
= allocateData(5);
674 appendOp(URX_LB_START
, dataLoc
);
677 appendOp(URX_LB_CONT
, dataLoc
);
678 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
679 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
682 appendOp(URX_NOP
, 0);
683 appendOp(URX_NOP
, 0);
685 // On the Parentheses stack, start a new frame and add the postions
686 // of the URX_LB_CONT and the NOP.
687 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
688 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
689 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
690 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
692 // The final two instructions will be added when the ')' is encountered.
697 case doOpenLookBehindNeg
:
699 // Compile a (?<! negated look-behind open paren.
702 // 0 URX_LB_START dataLoc # Save entry stack, input len
703 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
707 // 5 URX_NOP Standard '(' boilerplate.
708 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
709 // 7 <code for LookBehind expression>
710 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
713 // Allocate a block of matcher data, to contain (when running a match)
714 // 0: Stack ptr on entry
715 // 1: Input Index on entry
716 // 2: fActiveStart, the active bounds start on entry.
717 // 3: fActiveLimit, the active bounds limit on entry.
718 // 4: Start index of match current match attempt.
719 // The first four items must match the layout of data for LA_START / LA_END
721 // Generate match code for any pending literals.
724 // Allocate data space
725 int32_t dataLoc
= allocateData(5);
728 appendOp(URX_LB_START
, dataLoc
);
731 appendOp(URX_LBN_CONT
, dataLoc
);
732 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
733 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
734 appendOp(URX_RESERVED_OP
, 0); // Continue Loc. To be filled later.
737 appendOp(URX_NOP
, 0);
738 appendOp(URX_NOP
, 0);
740 // On the Parentheses stack, start a new frame and add the postions
741 // of the URX_LB_CONT and the NOP.
742 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
743 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
744 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
745 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
747 // The final two instructions will be added when the ')' is encountered.
751 case doConditionalExpr
:
752 // Conditionals such as (?(1)a:b)
754 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
755 error(U_REGEX_UNIMPLEMENTED
);
761 if (fParenStack
.size() <= 0) {
762 // Extra close paren, or missing open paren.
763 error(U_REGEX_MISMATCHED_PAREN
);
771 case doBadOpenParenType
:
773 error(U_REGEX_RULE_SYNTAX
);
777 case doMismatchedParenErr
:
778 error(U_REGEX_MISMATCHED_PAREN
);
782 // Normal '+' compiles to
783 // 1. stuff to be repeated (already built)
787 // Or, if the item to be repeated can match a zero length string,
788 // 1. STO_INP_LOC data-loc
789 // 2. body of stuff to be repeated
794 // Or, if the item to be repeated is simple
795 // 1. Item to be repeated.
796 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
797 // 3. LOOP_C stack location
799 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
802 // Check for simple constructs, which may get special optimized code.
803 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
804 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
806 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
807 // Emit optimized code for [char set]+
808 appendOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
809 frameLoc
= allocateStackData(1);
810 appendOp(URX_LOOP_C
, frameLoc
);
814 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
815 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
816 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
817 // Emit Optimized code for .+ operations.
818 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
819 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
820 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
823 if (fModeFlags
& UREGEX_UNIX_LINES
) {
827 frameLoc
= allocateStackData(1);
828 appendOp(URX_LOOP_C
, frameLoc
);
836 // Check for minimum match length of zero, which requires
837 // extra loop-breaking code.
838 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
839 // Zero length match is possible.
840 // Emit the code sequence that can handle it.
842 frameLoc
= allocateStackData(1);
844 int32_t op
= buildOp(URX_STO_INP_LOC
, frameLoc
);
845 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
847 appendOp(URX_JMP_SAV_X
, topLoc
+1);
849 // Simpler code when the repeated body must match something non-empty
850 appendOp(URX_JMP_SAV
, topLoc
);
856 // Non-greedy '+?' compiles to
857 // 1. stuff to be repeated (already built)
861 int32_t topLoc
= blockTopLoc(FALSE
);
862 appendOp(URX_STATE_SAVE
, topLoc
);
868 // Normal (greedy) ? quantifier.
871 // 2. body of optional block
873 // Insert the state save into the compiled pattern, and we're done.
875 int32_t saveStateLoc
= blockTopLoc(TRUE
);
876 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
877 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
882 // Non-greedy ?? quantifier
885 // 2. body of optional block
889 // This code is less than ideal, with two jmps instead of one, because we can only
890 // insert one instruction at the top of the block being iterated.
892 int32_t jmp1_loc
= blockTopLoc(TRUE
);
893 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
895 int32_t jmp1_op
= buildOp(URX_JMP
, jmp2_loc
+1);
896 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
898 appendOp(URX_JMP
, jmp2_loc
+2);
900 appendOp(URX_STATE_SAVE
, jmp1_loc
+1);
906 // Normal (greedy) * quantifier.
909 // 2. body of stuff being iterated over
913 // Or, if the body is a simple [Set],
914 // 1. LOOP_SR_I set number
915 // 2. LOOP_C stack location
918 // Or if this is a .*
919 // 1. LOOP_DOT_I (. matches all mode flag)
920 // 2. LOOP_C stack location
922 // Or, if the body can match a zero-length string, to inhibit infinite loops,
924 // 2. STO_INP_LOC data-loc
929 // location of item #1, the STATE_SAVE
930 int32_t topLoc
= blockTopLoc(FALSE
);
931 int32_t dataLoc
= -1;
933 // Check for simple *, where the construct being repeated
934 // compiled to single opcode, and might be optimizable.
935 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
936 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
938 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
939 // Emit optimized code for a [char set]*
940 int32_t loopOpI
= buildOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
941 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
942 dataLoc
= allocateStackData(1);
943 appendOp(URX_LOOP_C
, dataLoc
);
947 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
948 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
949 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
950 // Emit Optimized code for .* operations.
951 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
952 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
953 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
956 if ((fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
959 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
960 dataLoc
= allocateStackData(1);
961 appendOp(URX_LOOP_C
, dataLoc
);
966 // Emit general case code for this *
967 // The optimizations did not apply.
969 int32_t saveStateLoc
= blockTopLoc(TRUE
);
970 int32_t jmpOp
= buildOp(URX_JMP_SAV
, saveStateLoc
+1);
972 // Check for minimum match length of zero, which requires
973 // extra loop-breaking code.
974 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
975 insertOp(saveStateLoc
);
976 dataLoc
= allocateStackData(1);
978 int32_t op
= buildOp(URX_STO_INP_LOC
, dataLoc
);
979 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
980 jmpOp
= buildOp(URX_JMP_SAV_X
, saveStateLoc
+2);
983 // Locate the position in the compiled pattern where the match will continue
984 // after completing the *. (4 or 5 in the comment above)
985 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
987 // Put together the save state op and store it into the compiled code.
988 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, continueLoc
);
989 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
991 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
997 // Non-greedy *? quantifier
1000 // 2. body of stuff being iterated over
1004 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
1005 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
1006 int32_t jmpOp
= buildOp(URX_JMP
, saveLoc
);
1007 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
1008 appendOp(URX_STATE_SAVE
, jmpLoc
+1);
1013 case doIntervalInit
:
1014 // The '{' opening an interval quantifier was just scanned.
1015 // Init the counter varaiables that will accumulate the values as the digits
1018 fIntervalUpper
= -1;
1021 case doIntevalLowerDigit
:
1022 // Scanned a digit from the lower value of an {lower,upper} interval
1024 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1025 U_ASSERT(digitValue
>= 0);
1026 int64_t val
= (int64_t)fIntervalLow
*10 + digitValue
;
1027 if (val
> INT32_MAX
) {
1028 error(U_REGEX_NUMBER_TOO_BIG
);
1030 fIntervalLow
= (int32_t)val
;
1035 case doIntervalUpperDigit
:
1036 // Scanned a digit from the upper value of an {lower,upper} interval
1038 if (fIntervalUpper
< 0) {
1041 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1042 U_ASSERT(digitValue
>= 0);
1043 int64_t val
= (int64_t)fIntervalUpper
*10 + digitValue
;
1044 if (val
> INT32_MAX
) {
1045 error(U_REGEX_NUMBER_TOO_BIG
);
1047 fIntervalUpper
= (int32_t)val
;
1052 case doIntervalSame
:
1053 // Scanned a single value interval like {27}. Upper = Lower.
1054 fIntervalUpper
= fIntervalLow
;
1058 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1059 if (compileInlineInterval() == FALSE
) {
1060 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1064 case doPossessiveInterval
:
1065 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1067 // Remember the loc for the top of the block being looped over.
1068 // (Can not reserve a slot in the compiled pattern at this time, because
1069 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1071 int32_t topLoc
= blockTopLoc(FALSE
);
1073 // Produce normal looping code.
1074 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1076 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1077 // just as if the loop was inclosed in atomic parentheses.
1079 // First the STO_SP before the start of the loop
1082 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the
1083 int32_t op
= buildOp(URX_STO_SP
, varLoc
);
1084 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1086 int32_t loopOp
= (int32_t)fRXPat
->fCompiledPat
->popi();
1087 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1088 loopOp
++; // point LoopOp after the just-inserted STO_SP
1089 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1091 // Then the LD_SP after the end of the loop
1092 appendOp(URX_LD_SP
, varLoc
);
1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1099 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1102 case doIntervalError
:
1103 error(U_REGEX_BAD_INTERVAL
);
1107 // We've just scanned a "normal" character from the pattern,
1108 literalChar(fC
.fChar
);
1112 case doEscapedLiteralChar
:
1113 // We've just scanned an backslashed escaped character with no
1114 // special meaning. It represents itself.
1115 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1116 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1117 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1118 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1120 literalChar(fC
.fChar
);
1125 // scanned a ".", match any single character.
1128 if (fModeFlags
& UREGEX_DOTALL
) {
1129 appendOp(URX_DOTANY_ALL
, 0);
1130 } else if (fModeFlags
& UREGEX_UNIX_LINES
) {
1131 appendOp(URX_DOTANY_UNIX
, 0);
1133 appendOp(URX_DOTANY
, 0);
1141 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1142 appendOp(URX_CARET
, 0);
1143 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1144 appendOp(URX_CARET_M
, 0);
1145 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1146 appendOp(URX_CARET
, 0); // Only testing true start of input.
1147 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1148 appendOp(URX_CARET_M_UNIX
, 0);
1156 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1157 appendOp(URX_DOLLAR
, 0);
1158 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1159 appendOp(URX_DOLLAR_M
, 0);
1160 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1161 appendOp(URX_DOLLAR_D
, 0);
1162 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1163 appendOp(URX_DOLLAR_MD
, 0);
1170 appendOp(URX_CARET
, 0);
1175 #if UCONFIG_NO_BREAK_ITERATION==1
1176 if (fModeFlags
& UREGEX_UWORD
) {
1177 error(U_UNSUPPORTED_ERROR
);
1181 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1188 #if UCONFIG_NO_BREAK_ITERATION==1
1189 if (fModeFlags
& UREGEX_UWORD
) {
1190 error(U_UNSUPPORTED_ERROR
);
1194 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1201 appendOp(URX_BACKSLASH_D
, 1);
1206 appendOp(URX_BACKSLASH_D
, 0);
1211 appendOp(URX_BACKSLASH_G
, 0);
1216 appendOp(URX_BACKSLASH_H
, 1);
1221 appendOp(URX_BACKSLASH_H
, 0);
1226 appendOp(URX_BACKSLASH_R
, 0);
1231 appendOp(URX_STAT_SETREF_N
, URX_ISSPACE_SET
);
1236 appendOp(URX_STATIC_SETREF
, URX_ISSPACE_SET
);
1241 appendOp(URX_BACKSLASH_V
, 1);
1246 appendOp(URX_BACKSLASH_V
, 0);
1251 appendOp(URX_STAT_SETREF_N
, URX_ISWORD_SET
);
1256 appendOp(URX_STATIC_SETREF
, URX_ISWORD_SET
);
1261 appendOp(URX_BACKSLASH_X
, 0);
1267 appendOp(URX_DOLLAR
, 0);
1272 appendOp(URX_BACKSLASH_Z
, 0);
1276 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1287 UnicodeSet
*theSet
= scanProp();
1294 UChar32 c
= scanNamedChar();
1301 // BackReference. Somewhat unusual in that the front-end can not completely parse
1302 // the regular expression, because the number of digits to be consumed
1303 // depends on the number of capture groups that have been defined. So
1304 // we have to do it here instead.
1306 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1307 int32_t groupNum
= 0;
1308 UChar32 c
= fC
.fChar
;
1311 // Loop once per digit, for max allowed number of digits in a back reference.
1312 int32_t digit
= u_charDigitValue(c
);
1313 groupNum
= groupNum
* 10 + digit
;
1314 if (groupNum
>= numCaptureGroups
) {
1318 if (RegexStaticSets::gStaticSets
->fRuleDigitsAlias
->contains(c
) == FALSE
) {
1324 // Scan of the back reference in the source regexp is complete. Now generate
1325 // the compiled code for it.
1326 // Because capture groups can be forward-referenced by back-references,
1327 // we fill the operand with the capture group number. At the end
1328 // of compilation, it will be changed to the variable's location.
1329 U_ASSERT(groupNum
> 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1330 // and shouldn't enter this code path at all.
1332 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1333 appendOp(URX_BACKREF_I
, groupNum
);
1335 appendOp(URX_BACKREF
, groupNum
);
1340 case doBeginNamedBackRef
:
1341 U_ASSERT(fCaptureName
== NULL
);
1342 fCaptureName
= new UnicodeString
;
1343 if (fCaptureName
== NULL
) {
1344 error(U_MEMORY_ALLOCATION_ERROR
);
1348 case doContinueNamedBackRef
:
1349 fCaptureName
->append(fC
.fChar
);
1352 case doCompleteNamedBackRef
:
1354 int32_t groupNumber
=
1355 fRXPat
->fNamedCaptureMap
? uhash_geti(fRXPat
->fNamedCaptureMap
, fCaptureName
) : 0;
1356 if (groupNumber
== 0) {
1357 // Group name has not been defined.
1358 // Could be a forward reference. If we choose to support them at some
1359 // future time, extra mechanism will be required at this point.
1360 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
1362 // Given the number, handle identically to a \n numbered back reference.
1363 // See comments above, under doBackRef
1365 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1366 appendOp(URX_BACKREF_I
, groupNumber
);
1368 appendOp(URX_BACKREF
, groupNumber
);
1371 delete fCaptureName
;
1372 fCaptureName
= NULL
;
1376 case doPossessivePlus
:
1377 // Possessive ++ quantifier.
1380 // 2. body of stuff being iterated over
1386 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1387 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1391 int32_t topLoc
= blockTopLoc(TRUE
);
1392 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1393 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1394 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1396 // Emit the STATE_SAVE
1397 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1400 appendOp(URX_JMP
, topLoc
+1);
1403 appendOp(URX_LD_SP
, stoLoc
);
1407 case doPossessiveStar
:
1408 // Possessive *+ quantifier.
1412 // 3. body of stuff being iterated over
1416 // TODO: do something to cut back the state stack each time through the loop.
1418 // Reserve two slots at the top of the block.
1419 int32_t topLoc
= blockTopLoc(TRUE
);
1423 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1424 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1425 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1427 // Emit the SAVE_STATE 5
1428 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1429 op
= buildOp(URX_STATE_SAVE
, L7
);
1430 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1432 // Append the JMP operation.
1433 appendOp(URX_JMP
, topLoc
+1);
1435 // Emit the LD_SP loc
1436 appendOp(URX_LD_SP
, stoLoc
);
1440 case doPossessiveOpt
:
1441 // Possessive ?+ quantifier.
1445 // 3. body of optional block
1450 // Reserve two slots at the top of the block.
1451 int32_t topLoc
= blockTopLoc(TRUE
);
1455 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1456 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1457 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1459 // Emit the SAVE_STATE
1460 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1461 op
= buildOp(URX_STATE_SAVE
, continueLoc
);
1462 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1465 appendOp(URX_LD_SP
, stoLoc
);
1470 case doBeginMatchMode
:
1471 fNewModeFlags
= fModeFlags
;
1472 fSetModeFlag
= TRUE
;
1475 case doMatchMode
: // (?i) and similar
1479 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1480 case 0x64: /* 'd' */ bit
= UREGEX_UNIX_LINES
; break;
1481 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1482 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1483 case 0x75: /* 'u' */ bit
= 0; /* Unicode casing */ break;
1484 case 0x77: /* 'w' */ bit
= UREGEX_UWORD
; break;
1485 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1486 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1488 UPRV_UNREACHABLE
; // Should never happen. Other chars are filtered out
1492 fNewModeFlags
|= bit
;
1494 fNewModeFlags
&= ~bit
;
1499 case doSetMatchMode
:
1500 // Emit code to match any pending literals, using the not-yet changed match mode.
1503 // We've got a (?i) or similar. The match mode is being changed, but
1504 // the change is not scoped to a parenthesized block.
1505 U_ASSERT(fNewModeFlags
< 0);
1506 fModeFlags
= fNewModeFlags
;
1511 case doMatchModeParen
:
1512 // We've got a (?i: or similar. Begin a parenthesized block, save old
1513 // mode flags so they can be restored at the close of the block.
1516 // - NOP, which later may be replaced by a save-state if the
1517 // parenthesized group gets a * quantifier, followed by
1518 // - NOP, which may later be replaced by a save-state if there
1519 // is an '|' alternation within the parens.
1522 appendOp(URX_NOP
, 0);
1523 appendOp(URX_NOP
, 0);
1525 // On the Parentheses stack, start a new frame and add the postions
1526 // of the two NOPs (a normal non-capturing () frame, except for the
1527 // saving of the orignal mode flags.)
1528 fParenStack
.push(fModeFlags
, *fStatus
);
1529 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1530 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1531 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1533 // Set the current mode flags to the new values.
1534 U_ASSERT(fNewModeFlags
< 0);
1535 fModeFlags
= fNewModeFlags
;
1540 error(U_REGEX_INVALID_FLAG
);
1543 case doSuppressComments
:
1544 // We have just scanned a '(?'. We now need to prevent the character scanner from
1545 // treating a '#' as a to-the-end-of-line comment.
1546 // (This Perl compatibility just gets uglier and uglier to do...)
1547 fEOLComments
= FALSE
;
1553 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1560 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1565 case doSetBackslash_s
:
1567 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1568 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1572 case doSetBackslash_S
:
1574 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1575 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1581 case doSetBackslash_d
:
1583 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1584 // TODO - make a static set, ticket 6058.
1585 addCategory(set
, U_GC_ND_MASK
, *fStatus
);
1589 case doSetBackslash_D
:
1591 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1593 // TODO - make a static set, ticket 6058.
1594 digits
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
1595 digits
.complement();
1596 set
->addAll(digits
);
1600 case doSetBackslash_h
:
1602 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1604 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1605 h
.add((UChar32
)9); // Tab
1610 case doSetBackslash_H
:
1612 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1614 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1615 h
.add((UChar32
)9); // Tab
1621 case doSetBackslash_v
:
1623 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1624 set
->add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1625 set
->add((UChar32
)0x85);
1626 set
->add((UChar32
)0x2028, (UChar32
)0x2029);
1630 case doSetBackslash_V
:
1632 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1634 v
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1635 v
.add((UChar32
)0x85);
1636 v
.add((UChar32
)0x2028, (UChar32
)0x2029);
1642 case doSetBackslash_w
:
1644 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1645 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1649 case doSetBackslash_W
:
1651 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1652 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1660 fSetStack
.push(new UnicodeSet(), *fStatus
);
1661 fSetOpStack
.push(setStart
, *fStatus
);
1662 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1663 fSetOpStack
.push(setCaseClose
, *fStatus
);
1667 case doSetBeginDifference1
:
1668 // We have scanned something like [[abc]-[
1669 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670 // Push a Difference operator, which will cause the new set to be subtracted from what
1671 // went before once it is created.
1672 setPushOp(setDifference1
);
1673 fSetOpStack
.push(setStart
, *fStatus
);
1674 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1675 fSetOpStack
.push(setCaseClose
, *fStatus
);
1679 case doSetBeginIntersection1
:
1680 // We have scanned something like [[abc]&[
1681 // Need both the '&' operator and the open '[' operator.
1682 setPushOp(setIntersection1
);
1683 fSetOpStack
.push(setStart
, *fStatus
);
1684 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1685 fSetOpStack
.push(setCaseClose
, *fStatus
);
1689 case doSetBeginUnion
:
1690 // We have scanned something like [[abc][
1691 // Need to handle the union operation explicitly [[abc] | [
1692 setPushOp(setUnion
);
1693 fSetOpStack
.push(setStart
, *fStatus
);
1694 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1695 fSetOpStack
.push(setCaseClose
, *fStatus
);
1699 case doSetDifference2
:
1700 // We have scanned something like [abc--
1701 // Consider this to unambiguously be a set difference operator.
1702 setPushOp(setDifference2
);
1706 // Have encountered the ']' that closes a set.
1707 // Force the evaluation of any pending operations within this set,
1708 // leave the completed set on the top of the set stack.
1710 U_ASSERT(fSetOpStack
.peeki()==setStart
);
1716 // Finished a complete set expression, including all nested sets.
1717 // The close bracket has already triggered clearing out pending set operators,
1718 // the operator stack should be empty and the operand stack should have just
1719 // one entry, the result set.
1720 U_ASSERT(fSetOpStack
.empty());
1721 UnicodeSet
*theSet
= (UnicodeSet
*)fSetStack
.pop();
1722 U_ASSERT(fSetStack
.empty());
1727 case doSetIntersection2
:
1728 // Have scanned something like [abc&&
1729 setPushOp(setIntersection2
);
1733 // Union the just-scanned literal character into the set being built.
1734 // This operation is the highest precedence set operation, so we can always do
1735 // it immediately, without waiting to see what follows. It is necessary to perform
1736 // any pending '-' or '&' operation first, because these have the same precedence
1737 // as union-ing in a literal'
1740 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1742 fLastSetLiteral
= fC
.fChar
;
1746 case doSetLiteralEscaped
:
1747 // A back-slash escaped literal character was encountered.
1748 // Processing is the same as with setLiteral, above, with the addition of
1749 // the optional check for errors on escaped ASCII letters.
1751 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1752 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1753 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1754 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1757 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1759 fLastSetLiteral
= fC
.fChar
;
1763 case doSetNamedChar
:
1764 // Scanning a \N{UNICODE CHARACTER NAME}
1765 // Aside from the source of the character, the processing is identical to doSetLiteral,
1768 UChar32 c
= scanNamedChar();
1770 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1772 fLastSetLiteral
= c
;
1776 case doSetNamedRange
:
1777 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1778 // The left character is already in the set, and is saved in fLastSetLiteral.
1779 // The right side needs to be picked up, the scan is at the 'N'.
1780 // Lower Limit > Upper limit being an error matches both Java
1781 // and ICU UnicodeSet behavior.
1783 UChar32 c
= scanNamedChar();
1784 if (U_SUCCESS(*fStatus
) && (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> c
)) {
1785 error(U_REGEX_INVALID_RANGE
);
1787 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1788 s
->add(fLastSetLiteral
, c
);
1789 fLastSetLiteral
= c
;
1795 // Scanned a '^' at the start of a set.
1796 // Push the negation operator onto the set op stack.
1797 // A twist for case-insensitive matching:
1798 // the case closure operation must happen _before_ negation.
1799 // But the case closure operation will already be on the stack if it's required.
1800 // This requires checking for case closure, and swapping the stack order
1801 // if it is present.
1803 int32_t tosOp
= fSetOpStack
.peeki();
1804 if (tosOp
== setCaseClose
) {
1806 fSetOpStack
.push(setNegation
, *fStatus
);
1807 fSetOpStack
.push(setCaseClose
, *fStatus
);
1809 fSetOpStack
.push(setNegation
, *fStatus
);
1814 case doSetNoCloseError
:
1815 error(U_REGEX_MISSING_CLOSE_BRACKET
);
1819 error(U_REGEX_RULE_SYNTAX
); // -- or && at the end of a set. Illegal.
1822 case doSetPosixProp
:
1824 UnicodeSet
*s
= scanPosixProp();
1826 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1829 } // else error. scanProp() reported the error status already.
1834 // Scanned a \p \P within [brackets].
1836 UnicodeSet
*s
= scanProp();
1838 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1841 } // else error. scanProp() reported the error status already.
1847 // We have scanned literal-literal. Add the range to the set.
1848 // The left character is already in the set, and is saved in fLastSetLiteral.
1849 // The right side is the current character.
1850 // Lower Limit > Upper limit being an error matches both Java
1851 // and ICU UnicodeSet behavior.
1854 if (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> fC
.fChar
) {
1855 error(U_REGEX_INVALID_RANGE
);
1857 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1858 s
->add(fLastSetLiteral
, fC
.fChar
);
1866 if (U_FAILURE(*fStatus
)) {
1875 //------------------------------------------------------------------------------
1877 // literalChar We've encountered a literal character from the pattern,
1878 // or an escape sequence that reduces to a character.
1879 // Add it to the string containing all literal chars/strings from
1882 //------------------------------------------------------------------------------
1883 void RegexCompile::literalChar(UChar32 c
) {
1884 fLiteralChars
.append(c
);
1888 //------------------------------------------------------------------------------
1890 // fixLiterals When compiling something that can follow a literal
1891 // string in a pattern, emit the code to match the
1892 // accumulated literal string.
1894 // Optionally, split the last char of the string off into
1895 // a single "ONE_CHAR" operation, so that quantifiers can
1896 // apply to that char alone. Example: abc*
1897 // The * must apply to the 'c' only.
1899 //------------------------------------------------------------------------------
1900 void RegexCompile::fixLiterals(UBool split
) {
1902 // If no literal characters have been scanned but not yet had code generated
1903 // for them, nothing needs to be done.
1904 if (fLiteralChars
.length() == 0) {
1908 int32_t indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1909 UChar32 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1911 // Split: We need to ensure that the last item in the compiled pattern
1912 // refers only to the last literal scanned in the pattern, so that
1913 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1914 // Split before case folding for case insensitive matches.
1917 fLiteralChars
.truncate(indexOfLastCodePoint
);
1918 fixLiterals(FALSE
); // Recursive call, emit code to match the first part of the string.
1919 // Note that the truncated literal string may be empty, in which case
1920 // nothing will be emitted.
1922 literalChar(lastCodePoint
); // Re-add the last code point as if it were a new literal.
1923 fixLiterals(FALSE
); // Second recursive call, code for the final code point.
1927 // If we are doing case-insensitive matching, case fold the string. This may expand
1928 // the string, e.g. the German sharp-s turns into "ss"
1929 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1930 fLiteralChars
.foldCase();
1931 indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1932 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1935 if (indexOfLastCodePoint
== 0) {
1936 // Single character, emit a URX_ONECHAR op to match it.
1937 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1938 u_hasBinaryProperty(lastCodePoint
, UCHAR_CASE_SENSITIVE
)) {
1939 appendOp(URX_ONECHAR_I
, lastCodePoint
);
1941 appendOp(URX_ONECHAR
, lastCodePoint
);
1944 // Two or more chars, emit a URX_STRING to match them.
1945 if (fLiteralChars
.length() > 0x00ffffff || fRXPat
->fLiteralText
.length() > 0x00ffffff) {
1946 error(U_REGEX_PATTERN_TOO_BIG
);
1948 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1949 appendOp(URX_STRING_I
, fRXPat
->fLiteralText
.length());
1951 // TODO here: add optimization to split case sensitive strings of length two
1952 // into two single char ops, for efficiency.
1953 appendOp(URX_STRING
, fRXPat
->fLiteralText
.length());
1955 appendOp(URX_STRING_LEN
, fLiteralChars
.length());
1957 // Add this string into the accumulated strings of the compiled pattern.
1958 fRXPat
->fLiteralText
.append(fLiteralChars
);
1961 fLiteralChars
.remove();
1965 int32_t RegexCompile::buildOp(int32_t type
, int32_t val
) {
1966 if (U_FAILURE(*fStatus
)) {
1969 if (type
< 0 || type
> 255) {
1972 if (val
> 0x00ffffff) {
1976 if (!(type
== URX_RESERVED_OP_N
|| type
== URX_RESERVED_OP
)) {
1979 if (URX_TYPE(val
) != 0xff) {
1982 type
= URX_RESERVED_OP_N
;
1984 return (type
<< 24) | val
;
1988 //------------------------------------------------------------------------------
1990 // appendOp() Append a new instruction onto the compiled pattern
1991 // Includes error checking, limiting the size of the
1992 // pattern to lengths that can be represented in the
1993 // 24 bit operand field of an instruction.
1995 //------------------------------------------------------------------------------
1996 void RegexCompile::appendOp(int32_t op
) {
1997 if (U_FAILURE(*fStatus
)) {
2000 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
2001 if ((fRXPat
->fCompiledPat
->size() > 0x00fffff0) && U_SUCCESS(*fStatus
)) {
2002 error(U_REGEX_PATTERN_TOO_BIG
);
2006 void RegexCompile::appendOp(int32_t type
, int32_t val
) {
2007 appendOp(buildOp(type
, val
));
2011 //------------------------------------------------------------------------------
2013 // insertOp() Insert a slot for a new opcode into the already
2014 // compiled pattern code.
2016 // Fill the slot with a NOP. Our caller will replace it
2017 // with what they really wanted.
2019 //------------------------------------------------------------------------------
2020 void RegexCompile::insertOp(int32_t where
) {
2021 UVector64
*code
= fRXPat
->fCompiledPat
;
2022 U_ASSERT(where
>0 && where
< code
->size());
2024 int32_t nop
= buildOp(URX_NOP
, 0);
2025 code
->insertElementAt(nop
, where
, *fStatus
);
2027 // Walk through the pattern, looking for any ops with targets that
2028 // were moved down by the insert. Fix them.
2030 for (loc
=0; loc
<code
->size(); loc
++) {
2031 int32_t op
= (int32_t)code
->elementAti(loc
);
2032 int32_t opType
= URX_TYPE(op
);
2033 int32_t opValue
= URX_VAL(op
);
2034 if ((opType
== URX_JMP
||
2035 opType
== URX_JMPX
||
2036 opType
== URX_STATE_SAVE
||
2037 opType
== URX_CTR_LOOP
||
2038 opType
== URX_CTR_LOOP_NG
||
2039 opType
== URX_JMP_SAV
||
2040 opType
== URX_JMP_SAV_X
||
2041 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
2042 // Target location for this opcode is after the insertion point and
2043 // needs to be incremented to adjust for the insertion.
2045 op
= buildOp(opType
, opValue
);
2046 code
->setElementAt(op
, loc
);
2050 // Now fix up the parentheses stack. All positive values in it are locations in
2051 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2052 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
2053 int32_t x
= fParenStack
.elementAti(loc
);
2054 U_ASSERT(x
< code
->size());
2057 fParenStack
.setElementAt(x
, loc
);
2061 if (fMatchCloseParen
> where
) {
2064 if (fMatchOpenParen
> where
) {
2070 //------------------------------------------------------------------------------
2072 // allocateData() Allocate storage in the matcher's static data area.
2073 // Return the index for the newly allocated data.
2074 // The storage won't actually exist until we are running a match
2075 // operation, but the storage indexes are inserted into various
2076 // opcodes while compiling the pattern.
2078 //------------------------------------------------------------------------------
2079 int32_t RegexCompile::allocateData(int32_t size
) {
2080 if (U_FAILURE(*fStatus
)) {
2083 if (size
<= 0 || size
> 0x100 || fRXPat
->fDataSize
< 0) {
2084 error(U_REGEX_INTERNAL_ERROR
);
2087 int32_t dataIndex
= fRXPat
->fDataSize
;
2088 fRXPat
->fDataSize
+= size
;
2089 if (fRXPat
->fDataSize
>= 0x00fffff0) {
2090 error(U_REGEX_INTERNAL_ERROR
);
2096 //------------------------------------------------------------------------------
2098 // allocateStackData() Allocate space in the back-tracking stack frame.
2099 // Return the index for the newly allocated data.
2100 // The frame indexes are inserted into various
2101 // opcodes while compiling the pattern, meaning that frame
2102 // size must be restricted to the size that will fit
2103 // as an operand (24 bits).
2105 //------------------------------------------------------------------------------
2106 int32_t RegexCompile::allocateStackData(int32_t size
) {
2107 if (U_FAILURE(*fStatus
)) {
2110 if (size
<= 0 || size
> 0x100 || fRXPat
->fFrameSize
< 0) {
2111 error(U_REGEX_INTERNAL_ERROR
);
2114 int32_t dataIndex
= fRXPat
->fFrameSize
;
2115 fRXPat
->fFrameSize
+= size
;
2116 if (fRXPat
->fFrameSize
>= 0x00fffff0) {
2117 error(U_REGEX_PATTERN_TOO_BIG
);
2123 //------------------------------------------------------------------------------
2125 // blockTopLoc() Find or create a location in the compiled pattern
2126 // at the start of the operation or block that has
2127 // just been compiled. Needed when a quantifier (* or
2128 // whatever) appears, and we need to add an operation
2129 // at the start of the thing being quantified.
2131 // (Parenthesized Blocks) have a slot with a NOP that
2132 // is reserved for this purpose. .* or similar don't
2133 // and a slot needs to be added.
2135 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2136 // at the returned location.
2137 // FALSE - just return the address,
2138 // do not reserve a location there.
2140 //------------------------------------------------------------------------------
2141 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
2143 fixLiterals(TRUE
); // Emit code for any pending literals.
2144 // If last item was a string, emit separate op for the its last char.
2145 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
2147 // The item just processed is a parenthesized block.
2148 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
2149 U_ASSERT(theLoc
> 0);
2150 U_ASSERT(URX_TYPE(((uint32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
))) == URX_NOP
);
2153 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2155 // We need to make space now.
2156 theLoc
= fRXPat
->fCompiledPat
->size()-1;
2157 int32_t opAtTheLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
);
2158 if (URX_TYPE(opAtTheLoc
) == URX_STRING_LEN
) {
2159 // Strings take two opcode, we want the position of the first one.
2160 // We can have a string at this point if a single character case-folded to two.
2164 int32_t nop
= buildOp(URX_NOP
, 0);
2165 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
2173 //------------------------------------------------------------------------------
2175 // handleCloseParen When compiling a close paren, we need to go back
2176 // and fix up any JMP or SAVE operations within the
2177 // parenthesized block that need to target the end
2178 // of the block. The locations of these are kept on
2179 // the paretheses stack.
2181 // This function is called both when encountering a
2182 // real ) and at the end of the pattern.
2184 //------------------------------------------------------------------------------
2185 void RegexCompile::handleCloseParen() {
2188 if (fParenStack
.size() <= 0) {
2189 error(U_REGEX_MISMATCHED_PAREN
);
2193 // Emit code for any pending literals.
2196 // Fixup any operations within the just-closed parenthesized group
2197 // that need to reference the end of the (block).
2198 // (The first one popped from the stack is an unused slot for
2199 // alternation (OR) state save, but applying the fixup to it does no harm.)
2201 patIdx
= fParenStack
.popi();
2203 // value < 0 flags the start of the frame on the paren stack.
2206 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
2207 patOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(patIdx
);
2208 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
2209 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
2210 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
2211 fMatchOpenParen
= patIdx
;
2214 // At the close of any parenthesized block, restore the match mode flags to
2215 // the value they had at the open paren. Saved value is
2216 // at the top of the paren stack.
2217 fModeFlags
= fParenStack
.popi();
2218 U_ASSERT(fModeFlags
< 0);
2220 // DO any additional fixups, depending on the specific kind of
2221 // parentesized grouping this is
2226 // No additional fixups required.
2227 // (Grouping-only parentheses)
2230 // Capturing Parentheses.
2231 // Insert a End Capture op into the pattern.
2232 // The frame offset of the variables for this cg is obtained from the
2233 // start capture op and put it into the end-capture op.
2235 int32_t captureOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2236 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
2238 int32_t frameVarLocation
= URX_VAL(captureOp
);
2239 appendOp(URX_END_CAPTURE
, frameVarLocation
);
2243 // Atomic Parenthesis.
2244 // Insert a LD_SP operation to restore the state stack to the position
2245 // it was when the atomic parens were entered.
2247 int32_t stoOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2248 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
2249 int32_t stoLoc
= URX_VAL(stoOp
);
2250 appendOp(URX_LD_SP
, stoLoc
);
2256 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2257 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2258 int32_t dataLoc
= URX_VAL(startOp
);
2259 appendOp(URX_LA_END
, dataLoc
);
2265 // See comment at doOpenLookAheadNeg
2266 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
2267 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2268 int32_t dataLoc
= URX_VAL(startOp
);
2269 appendOp(URX_LA_END
, dataLoc
);
2270 appendOp(URX_BACKTRACK
, 0);
2271 appendOp(URX_LA_END
, dataLoc
);
2273 // Patch the URX_SAVE near the top of the block.
2274 // The destination of the SAVE is the final LA_END that was just added.
2275 int32_t saveOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
2276 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
2277 int32_t dest
= fRXPat
->fCompiledPat
->size()-1;
2278 saveOp
= buildOp(URX_STATE_SAVE
, dest
);
2279 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
2285 // See comment at doOpenLookBehind.
2287 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
2289 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2290 int32_t dataLoc
= URX_VAL(startOp
);
2291 appendOp(URX_LB_END
, dataLoc
);
2292 appendOp(URX_LA_END
, dataLoc
);
2294 // Determine the min and max bounds for the length of the
2295 // string that the pattern can match.
2296 // An unbounded upper limit is an error.
2297 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2298 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2299 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2300 if (URX_TYPE(maxML
) != 0) {
2301 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2304 if (maxML
== INT32_MAX
) {
2305 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2308 if (minML
== INT32_MAX
) {
2309 // This condition happens when no match is possible, such as with a
2310 // [set] expression containing no elements.
2311 // In principle, the generated code to evaluate the expression could be deleted,
2312 // but it's probably not worth the complication.
2315 U_ASSERT(minML
<= maxML
);
2317 // Insert the min and max match len bounds into the URX_LB_CONT op that
2318 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2319 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
2320 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
2329 // See comment at doOpenLookBehindNeg.
2331 // Append the URX_LBN_END to the compiled pattern.
2332 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2333 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2334 int32_t dataLoc
= URX_VAL(startOp
);
2335 appendOp(URX_LBN_END
, dataLoc
);
2337 // Determine the min and max bounds for the length of the
2338 // string that the pattern can match.
2339 // An unbounded upper limit is an error.
2340 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2341 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2342 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2343 if (URX_TYPE(maxML
) != 0) {
2344 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2347 if (maxML
== INT32_MAX
) {
2348 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2351 if (minML
== INT32_MAX
) {
2352 // This condition happens when no match is possible, such as with a
2353 // [set] expression containing no elements.
2354 // In principle, the generated code to evaluate the expression could be deleted,
2355 // but it's probably not worth the complication.
2359 U_ASSERT(minML
<= maxML
);
2361 // Insert the min and max match len bounds into the URX_LB_CONT op that
2362 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2363 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
2364 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
2366 // Insert the pattern location to continue at after a successful match
2367 // as the last operand of the URX_LBN_CONT
2368 int32_t op
= buildOp(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
2369 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
2379 // remember the next location in the compiled pattern.
2380 // The compilation of Quantifiers will look at this to see whether its looping
2381 // over a parenthesized block or a single item
2382 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
2387 //------------------------------------------------------------------------------
2389 // compileSet Compile the pattern operations for a reference to a
2392 //------------------------------------------------------------------------------
2393 void RegexCompile::compileSet(UnicodeSet
*theSet
)
2395 if (theSet
== NULL
) {
2398 // Remove any strings from the set.
2399 // There shoudn't be any, but just in case.
2400 // (Case Closure can add them; if we had a simple case closure avaialble that
2401 // ignored strings, that would be better.)
2402 theSet
->removeAllStrings();
2403 int32_t setSize
= theSet
->size();
2408 // Set of no elements. Always fails to match.
2409 appendOp(URX_BACKTRACK
, 0);
2416 // The set contains only a single code point. Put it into
2417 // the compiled pattern as a single char operation rather
2418 // than a set, and discard the set itself.
2419 literalChar(theSet
->charAt(0));
2426 // The set contains two or more chars. (the normal case)
2427 // Put it into the compiled pattern as a set.
2428 int32_t setNumber
= fRXPat
->fSets
->size();
2429 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
2430 appendOp(URX_SETREF
, setNumber
);
2436 //------------------------------------------------------------------------------
2438 // compileInterval Generate the code for a {min, max} style interval quantifier.
2439 // Except for the specific opcodes used, the code is the same
2440 // for all three types (greedy, non-greedy, possessive) of
2441 // intervals. The opcodes are supplied as parameters.
2442 // (There are two sets of opcodes - greedy & possessive use the
2443 // same ones, while non-greedy has it's own.)
2445 // The code for interval loops has this form:
2446 // 0 CTR_INIT counter loc (in stack frame)
2447 // 1 5 patt address of CTR_LOOP at bottom of block
2449 // 3 max count (-1 for unbounded)
2450 // 4 ... block to be iterated over
2454 //------------------------------------------------------------------------------
2455 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
2457 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2458 // four slots in the compiled code. Reserve them.
2459 int32_t topOfBlock
= blockTopLoc(TRUE
);
2460 insertOp(topOfBlock
);
2461 insertOp(topOfBlock
);
2462 insertOp(topOfBlock
);
2464 // The operands for the CTR_INIT opcode include the index in the matcher data
2465 // of the counter. Allocate it now. There are two data items
2466 // counterLoc --> Loop counter
2467 // +1 --> Input index (for breaking non-progressing loops)
2468 // (Only present if unbounded upper limit on loop)
2469 int32_t dataSize
= fIntervalUpper
< 0 ? 2 : 1;
2470 int32_t counterLoc
= allocateStackData(dataSize
);
2472 int32_t op
= buildOp(InitOp
, counterLoc
);
2473 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
2475 // The second operand of CTR_INIT is the location following the end of the loop.
2476 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2477 // compilation of something later on causes the code to grow and the target
2478 // position to move.
2479 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
2480 op
= buildOp(URX_RELOC_OPRND
, loopEnd
);
2481 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
2483 // Followed by the min and max counts.
2484 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
2485 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
2487 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2488 // Goes at end of the block being looped over, so just append to the code so far.
2489 appendOp(LoopOp
, topOfBlock
);
2491 if ((fIntervalLow
& 0xff000000) != 0 ||
2492 (fIntervalUpper
> 0 && (fIntervalUpper
& 0xff000000) != 0)) {
2493 error(U_REGEX_NUMBER_TOO_BIG
);
2496 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
2497 error(U_REGEX_MAX_LT_MIN
);
2503 UBool
RegexCompile::compileInlineInterval() {
2504 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
2505 // Too big to inline. Fail, which will cause looping code to be generated.
2506 // (Upper < Lower picks up unbounded upper and errors, both.)
2510 int32_t topOfBlock
= blockTopLoc(FALSE
);
2511 if (fIntervalUpper
== 0) {
2512 // Pathological case. Attempt no matches, as if the block doesn't exist.
2513 // Discard the generated code for the block.
2514 // If the block included parens, discard the info pertaining to them as well.
2515 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2516 if (fMatchOpenParen
>= topOfBlock
) {
2517 fMatchOpenParen
= -1;
2519 if (fMatchCloseParen
>= topOfBlock
) {
2520 fMatchCloseParen
= -1;
2525 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2526 // The thing being repeated is not a single op, but some
2527 // more complex block. Do it as a loop, not inlines.
2528 // Note that things "repeated" a max of once are handled as inline, because
2529 // the one copy of the code already generated is just fine.
2533 // Pick up the opcode that is to be repeated
2535 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2537 // Compute the pattern location where the inline sequence
2538 // will end, and set up the state save op that will be needed.
2540 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2541 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2542 int32_t saveOp
= buildOp(URX_STATE_SAVE
, endOfSequenceLoc
);
2543 if (fIntervalLow
== 0) {
2544 insertOp(topOfBlock
);
2545 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2550 // Loop, emitting the op for the thing being repeated each time.
2551 // Loop starts at 1 because one instance of the op already exists in the pattern,
2552 // it was put there when it was originally encountered.
2554 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2555 if (i
>= fIntervalLow
) {
2565 //------------------------------------------------------------------------------
2567 // caseInsensitiveStart given a single code point from a pattern string, determine the
2568 // set of characters that could potentially begin a case-insensitive
2569 // match of a string beginning with that character, using full Unicode
2570 // case insensitive matching.
2572 // This is used in optimizing find().
2574 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2575 // misses cases like this:
2576 // A string from the pattern begins with 'ss' (although all we know
2577 // in this context is that it begins with 's')
2578 // The pattern could match a string beginning with a German sharp-s
2580 // To the ordinary case closure for a character c, we add all other
2581 // characters cx where the case closure of cx incudes a string form that begins
2582 // with the original character c.
2584 // This function could be made smarter. The full pattern string is available
2585 // and it would be possible to verify that the extra characters being added
2586 // to the starting set fully match, rather than having just a first-char of the
2587 // folded form match.
2589 //------------------------------------------------------------------------------
2590 void RegexCompile::findCaseInsensitiveStarters(UChar32 c
, UnicodeSet
*starterChars
) {
2592 // Machine Generated below.
2593 // It may need updating with new versions of Unicode.
2594 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2595 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2597 // Machine Generated Data. Do not hand edit.
2598 static const UChar32 RECaseFixCodePoints
[] = {
2599 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2600 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2601 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2602 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2603 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2605 static const int16_t RECaseFixStringOffsets
[] = {
2606 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2607 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2608 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2609 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2610 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2612 static const int16_t RECaseFixCounts
[] = {
2613 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2614 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2615 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2616 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2617 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2619 static const UChar RECaseFixData
[] = {
2620 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2621 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2622 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2623 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2624 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2625 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2626 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2627 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2628 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2629 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2630 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2632 // End of machine generated data.
2634 if (c
< UCHAR_MIN_VALUE
|| c
> UCHAR_MAX_VALUE
) {
2635 // This function should never be called with an invalid input character.
2637 } else if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2638 UChar32 caseFoldedC
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
2639 starterChars
->set(caseFoldedC
, caseFoldedC
);
2642 for (i
=0; RECaseFixCodePoints
[i
]<c
; i
++) {
2643 // Simple linear search through the sorted list of interesting code points.
2646 if (RECaseFixCodePoints
[i
] == c
) {
2647 int32_t dataIndex
= RECaseFixStringOffsets
[i
];
2648 int32_t numCharsToAdd
= RECaseFixCounts
[i
];
2649 UChar32 cpToAdd
= 0;
2650 for (int32_t j
=0; j
<numCharsToAdd
; j
++) {
2651 U16_NEXT_UNSAFE(RECaseFixData
, dataIndex
, cpToAdd
);
2652 starterChars
->add(cpToAdd
);
2656 starterChars
->closeOver(USET_CASE_INSENSITIVE
);
2657 starterChars
->removeAllStrings();
2659 // Not a cased character. Just return it alone.
2660 starterChars
->set(c
, c
);
2665 // Increment with overflow check.
2666 // val and delta will both be positive.
2668 static int32_t safeIncrement(int32_t val
, int32_t delta
) {
2669 if (INT32_MAX
- val
> delta
) {
2677 //------------------------------------------------------------------------------
2679 // matchStartType Determine how a match can start.
2680 // Used to optimize find() operations.
2682 // Operation is very similar to minMatchLength(). Walk the compiled
2683 // pattern, keeping an on-going minimum-match-length. For any
2684 // op where the min match coming in is zero, add that ops possible
2685 // starting matches to the possible starts for the overall pattern.
2687 //------------------------------------------------------------------------------
2688 void RegexCompile::matchStartType() {
2689 if (U_FAILURE(*fStatus
)) {
2694 int32_t loc
; // Location in the pattern of the current op being processed.
2695 int32_t op
; // The op being processed
2696 int32_t opType
; // The opcode type of the op
2697 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2698 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2700 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2701 // could have advanced the position in a match.
2702 // (Maximum match length so far == 0)
2704 // forwardedLength is a vector holding minimum-match-length values that
2705 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2706 // It must be one longer than the pattern being checked because some ops
2707 // will jmp to a end-of-block+1 location from within a block, and we must
2708 // count those when checking the block.
2709 int32_t end
= fRXPat
->fCompiledPat
->size();
2710 UVector32
forwardedLength(end
+1, *fStatus
);
2711 forwardedLength
.setSize(end
+1);
2712 for (loc
=3; loc
<end
; loc
++) {
2713 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2716 for (loc
= 3; loc
<end
; loc
++) {
2717 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2718 opType
= URX_TYPE(op
);
2720 // The loop is advancing linearly through the pattern.
2721 // If the op we are now at was the destination of a branch in the pattern,
2722 // and that path has a shorter minimum length than the current accumulated value,
2723 // replace the current accumulated value.
2724 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2725 currentLen
= forwardedLength
.elementAti(loc
);
2726 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2730 // Ops that don't change the total length matched
2731 case URX_RESERVED_OP
:
2734 case URX_STRING_LEN
:
2736 case URX_START_CAPTURE
:
2737 case URX_END_CAPTURE
:
2738 case URX_BACKSLASH_B
:
2739 case URX_BACKSLASH_BU
:
2740 case URX_BACKSLASH_G
:
2741 case URX_BACKSLASH_Z
:
2746 case URX_RELOC_OPRND
:
2747 case URX_STO_INP_LOC
:
2748 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2751 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2757 fRXPat
->fStartType
= START_START
;
2762 case URX_CARET_M_UNIX
:
2764 fRXPat
->fStartType
= START_LINE
;
2769 if (currentLen
== 0) {
2770 // This character could appear at the start of a match.
2771 // Add it to the set of possible starting characters.
2772 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2773 numInitialStrings
+= 2;
2775 currentLen
= safeIncrement(currentLen
, 1);
2781 if (currentLen
== 0) {
2782 int32_t sn
= URX_VAL(op
);
2783 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2784 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2785 fRXPat
->fInitialChars
->addAll(*s
);
2786 numInitialStrings
+= 2;
2788 currentLen
= safeIncrement(currentLen
, 1);
2793 // [Set]*, like a SETREF, above, in what it can match,
2794 // but may not match at all, so currentLen is not incremented.
2795 if (currentLen
== 0) {
2796 int32_t sn
= URX_VAL(op
);
2797 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2798 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2799 fRXPat
->fInitialChars
->addAll(*s
);
2800 numInitialStrings
+= 2;
2805 case URX_LOOP_DOT_I
:
2806 if (currentLen
== 0) {
2807 // .* at the start of a pattern.
2808 // Any character can begin the match.
2809 fRXPat
->fInitialChars
->clear();
2810 fRXPat
->fInitialChars
->complement();
2811 numInitialStrings
+= 2;
2817 case URX_STATIC_SETREF
:
2818 if (currentLen
== 0) {
2819 int32_t sn
= URX_VAL(op
);
2820 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2821 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2822 fRXPat
->fInitialChars
->addAll(*s
);
2823 numInitialStrings
+= 2;
2825 currentLen
= safeIncrement(currentLen
, 1);
2831 case URX_STAT_SETREF_N
:
2832 if (currentLen
== 0) {
2833 int32_t sn
= URX_VAL(op
);
2834 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2837 fRXPat
->fInitialChars
->addAll(sc
);
2838 numInitialStrings
+= 2;
2840 currentLen
= safeIncrement(currentLen
, 1);
2846 case URX_BACKSLASH_D
:
2848 if (currentLen
== 0) {
2850 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2851 if (URX_VAL(op
) != 0) {
2854 fRXPat
->fInitialChars
->addAll(s
);
2855 numInitialStrings
+= 2;
2857 currentLen
= safeIncrement(currentLen
, 1);
2862 case URX_BACKSLASH_H
:
2863 // Horiz white space
2864 if (currentLen
== 0) {
2866 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
2867 s
.add((UChar32
)9); // Tab
2868 if (URX_VAL(op
) != 0) {
2871 fRXPat
->fInitialChars
->addAll(s
);
2872 numInitialStrings
+= 2;
2874 currentLen
= safeIncrement(currentLen
, 1);
2879 case URX_BACKSLASH_R
: // Any line ending sequence
2880 case URX_BACKSLASH_V
: // Any line ending code point, with optional negation
2881 if (currentLen
== 0) {
2883 s
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
2884 s
.add((UChar32
)0x85);
2885 s
.add((UChar32
)0x2028, (UChar32
)0x2029);
2886 if (URX_VAL(op
) != 0) {
2887 // Complement option applies to URX_BACKSLASH_V only.
2890 fRXPat
->fInitialChars
->addAll(s
);
2891 numInitialStrings
+= 2;
2893 currentLen
= safeIncrement(currentLen
, 1);
2900 // Case Insensitive Single Character.
2901 if (currentLen
== 0) {
2902 UChar32 c
= URX_VAL(op
);
2903 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2904 UnicodeSet
starters(c
, c
);
2905 starters
.closeOver(USET_CASE_INSENSITIVE
);
2906 // findCaseInsensitiveStarters(c, &starters);
2907 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908 // The expanded folding can't match the pattern.
2909 fRXPat
->fInitialChars
->addAll(starters
);
2911 // Char has no case variants. Just add it as-is to the
2912 // set of possible starting chars.
2913 fRXPat
->fInitialChars
->add(c
);
2915 numInitialStrings
+= 2;
2917 currentLen
= safeIncrement(currentLen
, 1);
2922 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2923 case URX_DOTANY_ALL
: // . matches one or two.
2925 case URX_DOTANY_UNIX
:
2926 if (currentLen
== 0) {
2927 // These constructs are all bad news when they appear at the start
2928 // of a match. Any character can begin the match.
2929 fRXPat
->fInitialChars
->clear();
2930 fRXPat
->fInitialChars
->complement();
2931 numInitialStrings
+= 2;
2933 currentLen
= safeIncrement(currentLen
, 1);
2939 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2943 int32_t jmpDest
= URX_VAL(op
);
2944 if (jmpDest
< loc
) {
2945 // Loop of some kind. Can safely ignore, the worst that will happen
2946 // is that we understate the true minimum length
2947 currentLen
= forwardedLength
.elementAti(loc
+1);
2950 // Forward jump. Propagate the current min length to the target loc of the jump.
2951 U_ASSERT(jmpDest
<= end
+1);
2952 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2953 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2962 // Combo of state save to the next loc, + jmp backwards.
2963 // Net effect on min. length computation is nothing.
2968 // Fails are kind of like a branch, except that the min length was
2969 // propagated already, by the state save.
2970 currentLen
= forwardedLength
.elementAti(loc
+1);
2975 case URX_STATE_SAVE
:
2977 // State Save, for forward jumps, propagate the current minimum.
2978 // of the state save.
2979 int32_t jmpDest
= URX_VAL(op
);
2980 if (jmpDest
> loc
) {
2981 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2982 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2995 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2996 int32_t stringLen
= URX_VAL(stringLenOp
);
2997 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2998 U_ASSERT(stringLenOp
>= 2);
2999 if (currentLen
== 0) {
3000 // Add the starting character of this string to the set of possible starting
3001 // characters for this pattern.
3002 int32_t stringStartIdx
= URX_VAL(op
);
3003 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
3004 fRXPat
->fInitialChars
->add(c
);
3006 // Remember this string. After the entire pattern has been checked,
3007 // if nothing else is identified that can start a match, we'll use it.
3008 numInitialStrings
++;
3009 fRXPat
->fInitialStringIdx
= stringStartIdx
;
3010 fRXPat
->fInitialStringLen
= stringLen
;
3013 currentLen
= safeIncrement(currentLen
, stringLen
);
3020 // Case-insensitive string. Unlike exact-match strings, we won't
3021 // attempt a string search for possible match positions. But we
3022 // do update the set of possible starting characters.
3024 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3025 int32_t stringLen
= URX_VAL(stringLenOp
);
3026 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
3027 U_ASSERT(stringLenOp
>= 2);
3028 if (currentLen
== 0) {
3029 // Add the starting character of this string to the set of possible starting
3030 // characters for this pattern.
3031 int32_t stringStartIdx
= URX_VAL(op
);
3032 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
3034 findCaseInsensitiveStarters(c
, &s
);
3035 fRXPat
->fInitialChars
->addAll(s
);
3036 numInitialStrings
+= 2; // Matching on an initial string not possible.
3038 currentLen
= safeIncrement(currentLen
, stringLen
);
3044 case URX_CTR_INIT_NG
:
3046 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3047 // so location must be updated accordingly.
3049 // If the min loop count == 0
3050 // move loc forwards to the end of the loop, skipping over the body.
3051 // If the min count is > 0,
3052 // continue normal processing of the body of the loop.
3053 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3054 loopEndLoc
= URX_VAL(loopEndLoc
);
3055 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3056 if (minLoopCount
== 0) {
3057 // Min Loop Count of 0, treat like a forward branch and
3058 // move the current minimum length up to the target
3059 // (end of loop) location.
3060 U_ASSERT(loopEndLoc
<= end
+1);
3061 if (forwardedLength
.elementAti(loopEndLoc
) > currentLen
) {
3062 forwardedLength
.setElementAt(currentLen
, loopEndLoc
);
3065 loc
+=3; // Skips over operands of CTR_INIT
3072 case URX_CTR_LOOP_NG
:
3074 // The jump is conditional, backwards only.
3079 // More loop ops. These state-save to themselves.
3080 // don't change the minimum match
3088 // Look-around. Scan forward until the matching look-ahead end,
3089 // without processing the look-around block. This is overly pessimistic.
3091 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3092 // lookahead contains two LA_END instructions, so count goes up by two
3093 // for each LA_START.
3094 int32_t depth
= (opType
== URX_LA_START
? 2: 1);
3097 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3098 if (URX_TYPE(op
) == URX_LA_START
) {
3101 if (URX_TYPE(op
) == URX_LB_START
) {
3104 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3110 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3111 // Need this because neg lookahead blocks will FAIL to outside
3113 int32_t jmpDest
= URX_VAL(op
);
3114 if (jmpDest
> loc
) {
3115 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3116 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3120 U_ASSERT(loc
<= end
);
3130 UPRV_UNREACHABLE
; // Shouldn't get here. These ops should be
3131 // consumed by the scan in URX_LA_START and LB_START
3139 // We have finished walking through the ops. Check whether some forward jump
3140 // propagated a shorter length to location end+1.
3141 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3142 currentLen
= forwardedLength
.elementAti(end
+1);
3146 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
3149 // Sort out what we should check for when looking for candidate match start positions.
3150 // In order of preference,
3151 // 1. Start of input text buffer.
3152 // 2. A literal string.
3153 // 3. Start of line in multi-line mode.
3154 // 4. A single literal character.
3155 // 5. A character from a set of characters.
3157 if (fRXPat
->fStartType
== START_START
) {
3158 // Match only at the start of an input text string.
3159 // start type is already set. We're done.
3160 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
3161 // Match beginning only with a literal string.
3162 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
3163 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
3164 fRXPat
->fStartType
= START_STRING
;
3165 fRXPat
->fInitialChar
= c
;
3166 } else if (fRXPat
->fStartType
== START_LINE
) {
3167 // Match at start of line in Multi-Line mode.
3168 // Nothing to do here; everything is already set.
3169 } else if (fRXPat
->fMinMatchLen
== 0) {
3170 // Zero length match possible. We could start anywhere.
3171 fRXPat
->fStartType
= START_NO_INFO
;
3172 } else if (fRXPat
->fInitialChars
->size() == 1) {
3173 // All matches begin with the same char.
3174 fRXPat
->fStartType
= START_CHAR
;
3175 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
3176 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
3177 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
3178 fRXPat
->fMinMatchLen
> 0) {
3179 // Matches start with a set of character smaller than the set of all chars.
3180 fRXPat
->fStartType
= START_SET
;
3182 // Matches can start with anything
3183 fRXPat
->fStartType
= START_NO_INFO
;
3191 //------------------------------------------------------------------------------
3193 // minMatchLength Calculate the length of the shortest string that could
3194 // match the specified pattern.
3195 // Length is in 16 bit code units, not code points.
3197 // The calculated length may not be exact. The returned
3198 // value may be shorter than the actual minimum; it must
3201 // start and end are the range of p-code operations to be
3202 // examined. The endpoints are included in the range.
3204 //------------------------------------------------------------------------------
3205 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
3206 if (U_FAILURE(*fStatus
)) {
3210 U_ASSERT(start
<= end
);
3211 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3217 int32_t currentLen
= 0;
3220 // forwardedLength is a vector holding minimum-match-length values that
3221 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222 // It must be one longer than the pattern being checked because some ops
3223 // will jmp to a end-of-block+1 location from within a block, and we must
3224 // count those when checking the block.
3225 UVector32
forwardedLength(end
+2, *fStatus
);
3226 forwardedLength
.setSize(end
+2);
3227 for (loc
=start
; loc
<=end
+1; loc
++) {
3228 forwardedLength
.setElementAt(INT32_MAX
, loc
);
3231 for (loc
= start
; loc
<=end
; loc
++) {
3232 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3233 opType
= URX_TYPE(op
);
3235 // The loop is advancing linearly through the pattern.
3236 // If the op we are now at was the destination of a branch in the pattern,
3237 // and that path has a shorter minimum length than the current accumulated value,
3238 // replace the current accumulated value.
3239 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3240 // no-match-possible cases.
3241 if (forwardedLength
.elementAti(loc
) < currentLen
) {
3242 currentLen
= forwardedLength
.elementAti(loc
);
3243 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3247 // Ops that don't change the total length matched
3248 case URX_RESERVED_OP
:
3250 case URX_STRING_LEN
:
3252 case URX_START_CAPTURE
:
3253 case URX_END_CAPTURE
:
3254 case URX_BACKSLASH_B
:
3255 case URX_BACKSLASH_BU
:
3256 case URX_BACKSLASH_G
:
3257 case URX_BACKSLASH_Z
:
3263 case URX_RELOC_OPRND
:
3264 case URX_STO_INP_LOC
:
3266 case URX_CARET_M_UNIX
:
3267 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3270 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3278 // Ops that match a minimum of one character (one or two 16 bit code units.)
3281 case URX_STATIC_SETREF
:
3282 case URX_STAT_SETREF_N
:
3284 case URX_BACKSLASH_D
:
3285 case URX_BACKSLASH_H
:
3286 case URX_BACKSLASH_R
:
3287 case URX_BACKSLASH_V
:
3289 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3290 case URX_DOTANY_ALL
: // . matches one or two.
3292 case URX_DOTANY_UNIX
:
3293 currentLen
= safeIncrement(currentLen
, 1);
3298 loc
++; // URX_JMPX has an extra operand, ignored here,
3299 // otherwise processed identically to URX_JMP.
3303 int32_t jmpDest
= URX_VAL(op
);
3304 if (jmpDest
< loc
) {
3305 // Loop of some kind. Can safely ignore, the worst that will happen
3306 // is that we understate the true minimum length
3307 currentLen
= forwardedLength
.elementAti(loc
+1);
3309 // Forward jump. Propagate the current min length to the target loc of the jump.
3310 U_ASSERT(jmpDest
<= end
+1);
3311 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
3312 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3320 // Back-tracks are kind of like a branch, except that the min length was
3321 // propagated already, by the state save.
3322 currentLen
= forwardedLength
.elementAti(loc
+1);
3327 case URX_STATE_SAVE
:
3329 // State Save, for forward jumps, propagate the current minimum.
3330 // of the state save.
3331 int32_t jmpDest
= URX_VAL(op
);
3332 if (jmpDest
> loc
) {
3333 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3334 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3344 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3345 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3353 // TODO: with full case folding, matching input text may be shorter than
3354 // the string we have here. More smarts could put some bounds on it.
3355 // Assume a min length of one for now. A min length of zero causes
3356 // optimization failures for a pattern like "string"+
3357 // currentLen += URX_VAL(stringLenOp);
3358 currentLen
= safeIncrement(currentLen
, 1);
3363 case URX_CTR_INIT_NG
:
3366 // If the min loop count == 0
3367 // move loc forwards to the end of the loop, skipping over the body.
3368 // If the min count is > 0,
3369 // continue normal processing of the body of the loop.
3370 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3371 loopEndLoc
= URX_VAL(loopEndLoc
);
3372 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3373 if (minLoopCount
== 0) {
3376 loc
+=3; // Skips over operands of CTR_INIT
3383 case URX_CTR_LOOP_NG
:
3385 // The jump is conditional, backwards only.
3389 case URX_LOOP_DOT_I
:
3391 // More loop ops. These state-save to themselves.
3392 // don't change the minimum match - could match nothing at all.
3399 // Look-around. Scan forward until the matching look-ahead end,
3400 // without processing the look-around block. This is overly pessimistic for look-ahead,
3401 // it assumes that the look-ahead match might be zero-length.
3402 // TODO: Positive lookahead could recursively do the block, then continue
3403 // with the longer of the block or the value coming in. Ticket 6060
3404 int32_t depth
= (opType
== URX_LA_START
? 2: 1);
3407 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3408 if (URX_TYPE(op
) == URX_LA_START
) {
3409 // The boilerplate for look-ahead includes two LA_END insturctions,
3410 // Depth will be decremented by each one when it is seen.
3413 if (URX_TYPE(op
) == URX_LB_START
) {
3416 if (URX_TYPE(op
) == URX_LA_END
) {
3422 if (URX_TYPE(op
)==URX_LBN_END
) {
3428 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3429 // Need this because neg lookahead blocks will FAIL to outside
3431 int32_t jmpDest
= URX_VAL(op
);
3432 if (jmpDest
> loc
) {
3433 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3434 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3438 U_ASSERT(loc
<= end
);
3448 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449 // range being sized, which happens when measuring size of look-behind blocks.
3458 // We have finished walking through the ops. Check whether some forward jump
3459 // propagated a shorter length to location end+1.
3460 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3461 currentLen
= forwardedLength
.elementAti(end
+1);
3462 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3468 //------------------------------------------------------------------------------
3470 // maxMatchLength Calculate the length of the longest string that could
3471 // match the specified pattern.
3472 // Length is in 16 bit code units, not code points.
3474 // The calculated length may not be exact. The returned
3475 // value may be longer than the actual maximum; it must
3476 // never be shorter.
3478 //------------------------------------------------------------------------------
3479 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
3480 if (U_FAILURE(*fStatus
)) {
3483 U_ASSERT(start
<= end
);
3484 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3489 int32_t currentLen
= 0;
3490 UVector32
forwardedLength(end
+1, *fStatus
);
3491 forwardedLength
.setSize(end
+1);
3493 for (loc
=start
; loc
<=end
; loc
++) {
3494 forwardedLength
.setElementAt(0, loc
);
3497 for (loc
= start
; loc
<=end
; loc
++) {
3498 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3499 opType
= URX_TYPE(op
);
3501 // The loop is advancing linearly through the pattern.
3502 // If the op we are now at was the destination of a branch in the pattern,
3503 // and that path has a longer maximum length than the current accumulated value,
3504 // replace the current accumulated value.
3505 if (forwardedLength
.elementAti(loc
) > currentLen
) {
3506 currentLen
= forwardedLength
.elementAti(loc
);
3510 // Ops that don't change the total length matched
3511 case URX_RESERVED_OP
:
3513 case URX_STRING_LEN
:
3515 case URX_START_CAPTURE
:
3516 case URX_END_CAPTURE
:
3517 case URX_BACKSLASH_B
:
3518 case URX_BACKSLASH_BU
:
3519 case URX_BACKSLASH_G
:
3520 case URX_BACKSLASH_Z
:
3526 case URX_RELOC_OPRND
:
3527 case URX_STO_INP_LOC
:
3529 case URX_CARET_M_UNIX
:
3531 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3541 // Ops that increase that cause an unbounded increase in the length
3542 // of a matched string, or that increase it a hard to characterize way.
3543 // Call the max length unbounded, and stop further checking.
3544 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3546 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3547 currentLen
= INT32_MAX
;
3551 // Ops that match a max of one character (possibly two 16 bit code units.)
3553 case URX_STATIC_SETREF
:
3554 case URX_STAT_SETREF_N
:
3556 case URX_BACKSLASH_D
:
3557 case URX_BACKSLASH_H
:
3558 case URX_BACKSLASH_R
:
3559 case URX_BACKSLASH_V
:
3561 case URX_DOTANY_ALL
:
3563 case URX_DOTANY_UNIX
:
3564 currentLen
= safeIncrement(currentLen
, 2);
3567 // Single literal character. Increase current max length by one or two,
3568 // depending on whether the char is in the supplementary range.
3570 currentLen
= safeIncrement(currentLen
, 1);
3571 if (URX_VAL(op
) > 0x10000) {
3572 currentLen
= safeIncrement(currentLen
, 1);
3583 int32_t jmpDest
= URX_VAL(op
);
3584 if (jmpDest
< loc
) {
3585 // Loop of some kind. Max match length is unbounded.
3586 currentLen
= INT32_MAX
;
3588 // Forward jump. Propagate the current min length to the target loc of the jump.
3589 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
3590 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3598 // back-tracks are kind of like a branch, except that the max length was
3599 // propagated already, by the state save.
3600 currentLen
= forwardedLength
.elementAti(loc
+1);
3604 case URX_STATE_SAVE
:
3606 // State Save, for forward jumps, propagate the current minimum.
3607 // of the state save.
3608 // For backwards jumps, they create a loop, maximum
3609 // match length is unbounded.
3610 int32_t jmpDest
= URX_VAL(op
);
3611 if (jmpDest
> loc
) {
3612 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
3613 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3616 currentLen
= INT32_MAX
;
3627 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3628 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3633 // TODO: This code assumes that any user string that matches will be no longer
3634 // than our compiled string, with case insensitive matching.
3635 // Our compiled string has been case-folded already.
3637 // Any matching user string will have no more code points than our
3638 // compiled (folded) string. Folding may add code points, but
3641 // There is a potential problem if a supplemental code point
3642 // case-folds to a BMP code point. In this case our compiled string
3643 // could be shorter (in code units) than a matching user string.
3645 // At this time (Unicode 6.1) there are no such characters, and this case
3646 // is not being handled. A test, intltest regex/Bug9283, will fail if
3647 // any problematic characters are added to Unicode.
3649 // If this happens, we can make a set of the BMP chars that the
3650 // troublesome supplementals fold to, scan our string, and bump the
3651 // currentLen one extra for each that is found.
3655 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3656 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3661 case URX_CTR_INIT_NG
:
3662 // For Loops, recursively call this function on the pattern for the loop body,
3663 // then multiply the result by the maximum loop count.
3665 int32_t loopEndLoc
= URX_VAL(fRXPat
->fCompiledPat
->elementAti(loc
+1));
3666 if (loopEndLoc
== loc
+4) {
3667 // Loop has an empty body. No affect on max match length.
3668 // Continue processing with code after the loop end.
3673 int32_t maxLoopCount
= static_cast<int32_t>(fRXPat
->fCompiledPat
->elementAti(loc
+3));
3674 if (maxLoopCount
== -1) {
3675 // Unbounded Loop. No upper bound on match length.
3676 currentLen
= INT32_MAX
;
3680 U_ASSERT(loopEndLoc
>= loc
+4);
3681 int64_t blockLen
= maxMatchLength(loc
+4, loopEndLoc
-1); // Recursive call.
3682 int64_t updatedLen
= (int64_t)currentLen
+ blockLen
* maxLoopCount
;
3683 if (updatedLen
>= INT32_MAX
) {
3684 currentLen
= INT32_MAX
;
3687 currentLen
= (int32_t)updatedLen
;
3693 case URX_CTR_LOOP_NG
:
3694 // These opcodes will be skipped over by code for URX_CTR_INIT.
3695 // We shouldn't encounter them here.
3699 case URX_LOOP_DOT_I
:
3701 // For anything to do with loops, make the match length unbounded.
3702 currentLen
= INT32_MAX
;
3709 // Look-ahead. Just ignore, treat the look-ahead block as if
3710 // it were normal pattern. Gives a too-long match length,
3711 // but good enough for now.
3714 // End of look-ahead ops should always be consumed by the processing at
3715 // the URX_LA_START op.
3716 // UPRV_UNREACHABLE;
3720 // Look-behind. Scan forward until the matching look-around end,
3721 // without processing the look-behind block.
3722 int32_t dataLoc
= URX_VAL(op
);
3723 for (loc
= loc
+ 1; loc
< end
; ++loc
) {
3724 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3725 int32_t opType
= URX_TYPE(op
);
3726 if ((opType
== URX_LA_END
|| opType
== URX_LBN_END
) && (URX_VAL(op
) == dataLoc
)) {
3730 U_ASSERT(loc
< end
);
3739 if (currentLen
== INT32_MAX
) {
3740 // The maximum length is unbounded.
3741 // Stop further processing of the pattern.
3751 //------------------------------------------------------------------------------
3753 // stripNOPs Remove any NOP operations from the compiled pattern code.
3754 // Extra NOPs are inserted for some constructs during the initial
3755 // code generation to provide locations that may be patched later.
3756 // Many end up unneeded, and are removed by this function.
3758 // In order to minimize the number of passes through the pattern,
3759 // back-reference fixup is also performed here (adjusting
3760 // back-reference operands to point to the correct frame offsets).
3762 //------------------------------------------------------------------------------
3763 void RegexCompile::stripNOPs() {
3765 if (U_FAILURE(*fStatus
)) {
3769 int32_t end
= fRXPat
->fCompiledPat
->size();
3770 UVector32
deltas(end
, *fStatus
);
3772 // Make a first pass over the code, computing the amount that things
3773 // will be offset at each location in the original code.
3776 for (loc
=0; loc
<end
; loc
++) {
3777 deltas
.addElement(d
, *fStatus
);
3778 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3779 if (URX_TYPE(op
) == URX_NOP
) {
3784 UnicodeString caseStringBuffer
;
3786 // Make a second pass over the code, removing the NOPs by moving following
3787 // code up, and patching operands that refer to code locations that
3788 // are being moved. The array of offsets from the first step is used
3789 // to compute the new operand values.
3792 for (src
=0; src
<end
; src
++) {
3793 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(src
);
3794 int32_t opType
= URX_TYPE(op
);
3799 case URX_STATE_SAVE
:
3802 case URX_CTR_LOOP_NG
:
3803 case URX_RELOC_OPRND
:
3807 // These are instructions with operands that refer to code locations.
3809 int32_t operandAddress
= URX_VAL(op
);
3810 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3811 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3812 op
= buildOp(opType
, fixedOperandAddress
);
3813 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3821 int32_t where
= URX_VAL(op
);
3822 if (where
> fRXPat
->fGroupMap
->size()) {
3823 error(U_REGEX_INVALID_BACK_REF
);
3826 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
3827 op
= buildOp(opType
, where
);
3828 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3831 fRXPat
->fNeedsAltInput
= TRUE
;
3834 case URX_RESERVED_OP
:
3835 case URX_RESERVED_OP_N
:
3840 case URX_STRING_LEN
:
3841 case URX_START_CAPTURE
:
3842 case URX_END_CAPTURE
:
3843 case URX_STATIC_SETREF
:
3844 case URX_STAT_SETREF_N
:
3848 case URX_BACKSLASH_B
:
3849 case URX_BACKSLASH_BU
:
3850 case URX_BACKSLASH_G
:
3851 case URX_BACKSLASH_X
:
3852 case URX_BACKSLASH_Z
:
3853 case URX_DOTANY_ALL
:
3854 case URX_BACKSLASH_D
:
3858 case URX_CTR_INIT_NG
:
3859 case URX_DOTANY_UNIX
:
3862 case URX_STO_INP_LOC
:
3869 case URX_CARET_M_UNIX
:
3876 case URX_LOOP_DOT_I
:
3880 case URX_BACKSLASH_H
:
3881 case URX_BACKSLASH_R
:
3882 case URX_BACKSLASH_V
:
3883 // These instructions are unaltered by the relocation.
3884 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3889 // Some op is unaccounted for.
3894 fRXPat
->fCompiledPat
->setSize(dst
);
3900 //------------------------------------------------------------------------------
3902 // Error Report a rule parse error.
3903 // Only report it if no previous error has been recorded.
3905 //------------------------------------------------------------------------------
3906 void RegexCompile::error(UErrorCode e
) {
3907 if (U_SUCCESS(*fStatus
) || e
== U_MEMORY_ALLOCATION_ERROR
) {
3909 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3910 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3911 // int64_t. If the values of the latter are out of range for the former,
3912 // set them to the appropriate "field not supported" values.
3913 if (fLineNum
> 0x7FFFFFFF) {
3914 fParseErr
->line
= 0;
3915 fParseErr
->offset
= -1;
3916 } else if (fCharNum
> 0x7FFFFFFF) {
3917 fParseErr
->line
= (int32_t)fLineNum
;
3918 fParseErr
->offset
= -1;
3920 fParseErr
->line
= (int32_t)fLineNum
;
3921 fParseErr
->offset
= (int32_t)fCharNum
;
3924 UErrorCode status
= U_ZERO_ERROR
; // throwaway status for extracting context
3926 // Fill in the context.
3927 // Note: extractBetween() pins supplied indicies to the string bounds.
3928 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3929 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3930 utext_extract(fRXPat
->fPattern
, fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
, fParseErr
->preContext
, U_PARSE_CONTEXT_LEN
, &status
);
3931 utext_extract(fRXPat
->fPattern
, fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1, fParseErr
->postContext
, U_PARSE_CONTEXT_LEN
, &status
);
3937 // Assorted Unicode character constants.
3938 // Numeric because there is no portable way to enter them as literals.
3941 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3942 static const UChar chLF
= 0x0a; // Line Feed
3943 static const UChar chPound
= 0x23; // '#', introduces a comment.
3944 static const UChar chDigit0
= 0x30; // '0'
3945 static const UChar chDigit7
= 0x37; // '9'
3946 static const UChar chColon
= 0x3A; // ':'
3947 static const UChar chE
= 0x45; // 'E'
3948 static const UChar chQ
= 0x51; // 'Q'
3949 //static const UChar chN = 0x4E; // 'N'
3950 static const UChar chP
= 0x50; // 'P'
3951 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3952 //static const UChar chLBracket = 0x5b; // '['
3953 static const UChar chRBracket
= 0x5d; // ']'
3954 static const UChar chUp
= 0x5e; // '^'
3955 static const UChar chLowerP
= 0x70;
3956 static const UChar chLBrace
= 0x7b; // '{'
3957 static const UChar chRBrace
= 0x7d; // '}'
3958 static const UChar chNEL
= 0x85; // NEL newline variant
3959 static const UChar chLS
= 0x2028; // Unicode Line Separator
3962 //------------------------------------------------------------------------------
3964 // nextCharLL Low Level Next Char from the regex pattern.
3965 // Get a char from the string, keep track of input position
3966 // for error reporting.
3968 //------------------------------------------------------------------------------
3969 UChar32
RegexCompile::nextCharLL() {
3972 if (fPeekChar
!= -1) {
3978 // assume we're already in the right place
3979 ch
= UTEXT_NEXT32(fRXPat
->fPattern
);
3980 if (ch
== U_SENTINEL
) {
3987 (ch
== chLF
&& fLastChar
!= chCR
)) {
3988 // Character is starting a new line. Bump up the line number, and
3989 // reset the column to 0.
3994 // Character is not starting a new line. Except in the case of a
3995 // LF following a CR, increment the column position.
4004 //------------------------------------------------------------------------------
4006 // peekCharLL Low Level Character Scanning, sneak a peek at the next
4007 // character without actually getting it.
4009 //------------------------------------------------------------------------------
4010 UChar32
RegexCompile::peekCharLL() {
4011 if (fPeekChar
== -1) {
4012 fPeekChar
= nextCharLL();
4018 //------------------------------------------------------------------------------
4020 // nextChar for pattern scanning. At this level, we handle stripping
4021 // out comments and processing some backslash character escapes.
4022 // The rest of the pattern grammar is handled at the next level up.
4024 //------------------------------------------------------------------------------
4025 void RegexCompile::nextChar(RegexPatternChar
&c
) {
4027 fScanIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4028 c
.fChar
= nextCharLL();
4033 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
&& ((fModeFlags
& UREGEX_LITERAL
) == 0)) ||
4034 c
.fChar
== (UChar32
)-1) {
4035 fQuoteMode
= FALSE
; // Exit quote mode,
4036 nextCharLL(); // discard the E
4037 // nextChar(c); // recurse to get the real next char
4038 goto tailRecursion
; // Note: fuzz testing produced testcases that
4039 // resulted in stack overflow here.
4042 else if (fInBackslashQuote
) {
4043 // The current character immediately follows a '\'
4044 // Don't check for any further escapes, just return it as-is.
4045 // Don't set c.fQuoted, because that would prevent the state machine from
4046 // dispatching on the character.
4047 fInBackslashQuote
= FALSE
;
4051 // We are not in a \Q quoted region \E of the source.
4053 if (fModeFlags
& UREGEX_COMMENTS
) {
4055 // We are in free-spacing and comments mode.
4056 // Scan through any white space and comments, until we
4057 // reach a significant character or the end of inut.
4059 if (c
.fChar
== (UChar32
)-1) {
4060 break; // End of Input
4062 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
4063 // Start of a comment. Consume the rest of it, until EOF or a new line
4065 c
.fChar
= nextCharLL();
4066 if (c
.fChar
== (UChar32
)-1 || // EOF
4075 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4076 if (PatternProps::isWhiteSpace(c
.fChar
) == FALSE
) {
4079 c
.fChar
= nextCharLL();
4084 // check for backslash escaped characters.
4086 if (c
.fChar
== chBackSlash
) {
4087 int64_t pos
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4088 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
.contains(peekCharLL())) {
4090 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4091 // Includes \uxxxx, \n, \r, many others.
4092 // Return the single equivalent character.
4094 nextCharLL(); // get & discard the peeked char.
4097 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat
->fPattern
, fPatternLength
)) {
4098 int32_t endIndex
= (int32_t)pos
;
4099 c
.fChar
= u_unescapeAt(uregex_ucstr_unescape_charAt
, &endIndex
, (int32_t)fPatternLength
, (void *)fRXPat
->fPattern
->chunkContents
);
4101 if (endIndex
== pos
) {
4102 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4104 fCharNum
+= endIndex
- pos
;
4105 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, endIndex
);
4108 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat
->fPattern
);
4110 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, pos
);
4111 c
.fChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
4114 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4115 } else if (context
.lastOffset
== offset
) {
4116 UTEXT_PREVIOUS32(fRXPat
->fPattern
);
4117 } else if (context
.lastOffset
!= offset
-1) {
4118 utext_moveIndex32(fRXPat
->fPattern
, offset
- context
.lastOffset
- 1);
4123 else if (peekCharLL() == chDigit0
) {
4124 // Octal Escape, using Java Regexp Conventions
4125 // which are \0 followed by 1-3 octal digits.
4126 // Different from ICU Unescape handling of Octal, which does not
4127 // require the leading 0.
4128 // Java also has the convention of only consuming 2 octal digits if
4129 // the three digit number would be > 0xff
4132 nextCharLL(); // Consume the initial 0.
4134 for (index
=0; index
<3; index
++) {
4135 int32_t ch
= peekCharLL();
4136 if (ch
<chDigit0
|| ch
>chDigit7
) {
4138 // \0 is not followed by any octal digits.
4139 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4145 if (c
.fChar
<= 255) {
4148 // The last digit made the number too big. Forget we saw it.
4154 else if (peekCharLL() == chQ
) {
4155 // "\Q" enter quote mode, which will continue until "\E"
4157 nextCharLL(); // discard the 'Q'.
4158 // nextChar(c); // recurse to get the real next char.
4159 goto tailRecursion
; // Note: fuzz testing produced test cases that
4160 // resulted in stack overflow here.
4164 // We are in a '\' escape that will be handled by the state table scanner.
4165 // Just return the backslash, but remember that the following char is to
4166 // be taken literally.
4167 fInBackslashQuote
= TRUE
;
4172 // re-enable # to end-of-line comments, in case they were disabled.
4173 // They are disabled by the parser upon seeing '(?', but this lasts for
4174 // the fetching of the next character only.
4175 fEOLComments
= TRUE
;
4177 // putc(c.fChar, stdout);
4182 //------------------------------------------------------------------------------
4185 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4187 // The scan position will be at the 'N'. On return
4188 // the scan position should be just after the '}'
4190 // Return the UChar32
4192 //------------------------------------------------------------------------------
4193 UChar32
RegexCompile::scanNamedChar() {
4194 if (U_FAILURE(*fStatus
)) {
4199 if (fC
.fChar
!= chLBrace
) {
4200 error(U_REGEX_PROPERTY_SYNTAX
);
4204 UnicodeString charName
;
4207 if (fC
.fChar
== chRBrace
) {
4210 if (fC
.fChar
== -1) {
4211 error(U_REGEX_PROPERTY_SYNTAX
);
4214 charName
.append(fC
.fChar
);
4218 if (!uprv_isInvariantUString(charName
.getBuffer(), charName
.length()) ||
4219 (uint32_t)charName
.length()>=sizeof(name
)) {
4220 // All Unicode character names have only invariant characters.
4221 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4222 // which requires this error check
4223 error(U_REGEX_PROPERTY_SYNTAX
);
4226 charName
.extract(0, charName
.length(), name
, sizeof(name
), US_INV
);
4228 UChar32 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, name
, fStatus
);
4229 if (U_FAILURE(*fStatus
)) {
4230 error(U_REGEX_PROPERTY_SYNTAX
);
4233 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
4237 //------------------------------------------------------------------------------
4239 // scanProp Construct a UnicodeSet from the text at the current scan
4240 // position, which will be of the form \p{whaterver}
4242 // The scan position will be at the 'p' or 'P'. On return
4243 // the scan position should be just after the '}'
4245 // Return a UnicodeSet, constructed from the \P pattern,
4246 // or NULL if the pattern is invalid.
4248 //------------------------------------------------------------------------------
4249 UnicodeSet
*RegexCompile::scanProp() {
4250 UnicodeSet
*uset
= NULL
;
4252 if (U_FAILURE(*fStatus
)) {
4255 (void)chLowerP
; // Suppress compiler unused variable warning.
4256 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chP
);
4257 UBool negated
= (fC
.fChar
== chP
);
4259 UnicodeString propertyName
;
4261 if (fC
.fChar
!= chLBrace
) {
4262 error(U_REGEX_PROPERTY_SYNTAX
);
4267 if (fC
.fChar
== chRBrace
) {
4270 if (fC
.fChar
== -1) {
4271 // Hit the end of the input string without finding the closing '}'
4272 error(U_REGEX_PROPERTY_SYNTAX
);
4275 propertyName
.append(fC
.fChar
);
4277 uset
= createSetForProperty(propertyName
, negated
);
4278 nextChar(fC
); // Move input scan to position following the closing '}'
4282 //------------------------------------------------------------------------------
4284 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4285 // position, which is expected be of the form [:property expression:]
4287 // The scan position will be at the opening ':'. On return
4288 // the scan position must be on the closing ']'
4290 // Return a UnicodeSet constructed from the pattern,
4291 // or NULL if this is not a valid POSIX-style set expression.
4292 // If not a property expression, restore the initial scan position
4293 // (to the opening ':')
4295 // Note: the opening '[:' is not sufficient to guarantee that
4296 // this is a [:property:] expression.
4297 // [:'+=,] is a perfectly good ordinary set expression that
4298 // happens to include ':' as one of its characters.
4300 //------------------------------------------------------------------------------
4301 UnicodeSet
*RegexCompile::scanPosixProp() {
4302 UnicodeSet
*uset
= NULL
;
4304 if (U_FAILURE(*fStatus
)) {
4308 U_ASSERT(fC
.fChar
== chColon
);
4310 // Save the scanner state.
4311 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4312 int64_t savedScanIndex
= fScanIndex
;
4313 int64_t savedNextIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4314 UBool savedQuoteMode
= fQuoteMode
;
4315 UBool savedInBackslashQuote
= fInBackslashQuote
;
4316 UBool savedEOLComments
= fEOLComments
;
4317 int64_t savedLineNum
= fLineNum
;
4318 int64_t savedCharNum
= fCharNum
;
4319 UChar32 savedLastChar
= fLastChar
;
4320 UChar32 savedPeekChar
= fPeekChar
;
4321 RegexPatternChar savedfC
= fC
;
4323 // Scan for a closing ]. A little tricky because there are some perverse
4324 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4325 // ending on the second closing ].
4327 UnicodeString propName
;
4328 UBool negated
= FALSE
;
4330 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4332 if (fC
.fChar
== chUp
) {
4337 // Scan for the closing ":]", collecting the property name along the way.
4338 UBool sawPropSetTerminator
= FALSE
;
4340 propName
.append(fC
.fChar
);
4342 if (fC
.fQuoted
|| fC
.fChar
== -1) {
4343 // Escaped characters or end of input - either says this isn't a [:Property:]
4346 if (fC
.fChar
== chColon
) {
4348 if (fC
.fChar
== chRBracket
) {
4349 sawPropSetTerminator
= TRUE
;
4355 if (sawPropSetTerminator
) {
4356 uset
= createSetForProperty(propName
, negated
);
4361 // Restore the original scan position.
4362 // The main scanner will retry the input as a normal set expression,
4363 // not a [:Property:] expression.
4364 fScanIndex
= savedScanIndex
;
4365 fQuoteMode
= savedQuoteMode
;
4366 fInBackslashQuote
= savedInBackslashQuote
;
4367 fEOLComments
= savedEOLComments
;
4368 fLineNum
= savedLineNum
;
4369 fCharNum
= savedCharNum
;
4370 fLastChar
= savedLastChar
;
4371 fPeekChar
= savedPeekChar
;
4373 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, savedNextIndex
);
4378 static inline void addIdentifierIgnorable(UnicodeSet
*set
, UErrorCode
& ec
) {
4379 set
->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4380 addCategory(set
, U_GC_CF_MASK
, ec
);
4384 // Create a Unicode Set from a Unicode Property expression.
4385 // This is common code underlying both \p{...} ane [:...:] expressions.
4386 // Includes trying the Java "properties" that aren't supported as
4387 // normal ICU UnicodeSet properties
4389 UnicodeSet
*RegexCompile::createSetForProperty(const UnicodeString
&propName
, UBool negated
) {
4391 if (U_FAILURE(*fStatus
)) {
4394 LocalPointer
<UnicodeSet
> set
;
4395 UErrorCode status
= U_ZERO_ERROR
;
4397 do { // non-loop, exists to allow breaks from the block.
4399 // First try the property as we received it
4401 UnicodeString setExpr
;
4402 uint32_t usetFlags
= 0;
4403 setExpr
.append(u
"[\\p{", -1);
4404 setExpr
.append(propName
);
4405 setExpr
.append(u
"}]", -1);
4406 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
4407 usetFlags
|= USET_CASE_INSENSITIVE
;
4409 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr
, usetFlags
, NULL
, status
), status
);
4410 if (U_SUCCESS(status
) || status
== U_MEMORY_ALLOCATION_ERROR
) {
4415 // The incoming property wasn't directly recognized by ICU.
4417 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4418 // Java accepts 'word' with mixed case.
4419 // Java accepts 'all' only in all lower case.
4421 status
= U_ZERO_ERROR
;
4422 if (propName
.caseCompare(u
"word", -1, 0) == 0) {
4423 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat
->fStaticSets
[URX_ISWORD_SET
])), status
);
4426 if (propName
.compare(u
"all", -1) == 0) {
4427 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status
);
4432 // Do Java InBlock expressions
4434 UnicodeString mPropName
= propName
;
4435 if (mPropName
.startsWith(u
"In", 2) && mPropName
.length() >= 3) {
4436 status
= U_ZERO_ERROR
;
4437 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status
);
4438 if (U_FAILURE(status
)) {
4441 UnicodeString
blockName(mPropName
, 2); // Property with the leading "In" removed.
4442 set
->applyPropertyAlias(UnicodeString(u
"Block"), blockName
, status
);
4446 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4447 // as "BooleanPropertyValue". The property value can be either a
4448 // a General Category or a Script Name.
4450 if (propName
.startsWith(u
"Is", 2) && propName
.length()>=3) {
4451 mPropName
.remove(0, 2); // Strip the "Is"
4452 if (mPropName
.indexOf(u
'=') >= 0) {
4453 // Reject any "Is..." property expression containing an '=', that is,
4454 // any non-binary property expression.
4455 status
= U_REGEX_PROPERTY_SYNTAX
;
4459 if (mPropName
.caseCompare(u
"assigned", -1, 0) == 0) {
4460 mPropName
.setTo(u
"unassigned", -1);
4462 } else if (mPropName
.caseCompare(u
"TitleCase", -1, 0) == 0) {
4463 mPropName
.setTo(u
"Titlecase_Letter", -1);
4466 mPropName
.insert(0, u
"[\\p{", -1);
4467 mPropName
.append(u
"}]", -1);
4468 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName
, *fStatus
), status
);
4470 if (U_SUCCESS(status
) && !set
->isEmpty() && (usetFlags
& USET_CASE_INSENSITIVE
)) {
4471 set
->closeOver(USET_CASE_INSENSITIVE
);
4477 if (propName
.startsWith(u
"java", -1)) {
4478 status
= U_ZERO_ERROR
;
4479 set
.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status
);
4480 if (U_FAILURE(status
)) {
4484 // Try the various Java specific properties.
4485 // These all begin with "java"
4487 if (propName
.compare(u
"javaDefined", -1) == 0) {
4488 addCategory(set
.getAlias(), U_GC_CN_MASK
, status
);
4491 else if (propName
.compare(u
"javaDigit", -1) == 0) {
4492 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4494 else if (propName
.compare(u
"javaIdentifierIgnorable", -1) == 0) {
4495 addIdentifierIgnorable(set
.getAlias(), status
);
4497 else if (propName
.compare(u
"javaISOControl", -1) == 0) {
4498 set
->add(0, 0x1F).add(0x7F, 0x9F);
4500 else if (propName
.compare(u
"javaJavaIdentifierPart", -1) == 0) {
4501 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4502 addCategory(set
.getAlias(), U_GC_SC_MASK
, status
);
4503 addCategory(set
.getAlias(), U_GC_PC_MASK
, status
);
4504 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4505 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4506 addCategory(set
.getAlias(), U_GC_MC_MASK
, status
);
4507 addCategory(set
.getAlias(), U_GC_MN_MASK
, status
);
4508 addIdentifierIgnorable(set
.getAlias(), status
);
4510 else if (propName
.compare(u
"javaJavaIdentifierStart", -1) == 0) {
4511 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4512 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4513 addCategory(set
.getAlias(), U_GC_SC_MASK
, status
);
4514 addCategory(set
.getAlias(), U_GC_PC_MASK
, status
);
4516 else if (propName
.compare(u
"javaLetter", -1) == 0) {
4517 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4519 else if (propName
.compare(u
"javaLetterOrDigit", -1) == 0) {
4520 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4521 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4523 else if (propName
.compare(u
"javaLowerCase", -1) == 0) {
4524 addCategory(set
.getAlias(), U_GC_LL_MASK
, status
);
4526 else if (propName
.compare(u
"javaMirrored", -1) == 0) {
4527 set
->applyIntPropertyValue(UCHAR_BIDI_MIRRORED
, 1, status
);
4529 else if (propName
.compare(u
"javaSpaceChar", -1) == 0) {
4530 addCategory(set
.getAlias(), U_GC_Z_MASK
, status
);
4532 else if (propName
.compare(u
"javaSupplementaryCodePoint", -1) == 0) {
4533 set
->add(0x10000, UnicodeSet::MAX_VALUE
);
4535 else if (propName
.compare(u
"javaTitleCase", -1) == 0) {
4536 addCategory(set
.getAlias(), U_GC_LT_MASK
, status
);
4538 else if (propName
.compare(u
"javaUnicodeIdentifierStart", -1) == 0) {
4539 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4540 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4542 else if (propName
.compare(u
"javaUnicodeIdentifierPart", -1) == 0) {
4543 addCategory(set
.getAlias(), U_GC_L_MASK
, status
);
4544 addCategory(set
.getAlias(), U_GC_PC_MASK
, status
);
4545 addCategory(set
.getAlias(), U_GC_ND_MASK
, status
);
4546 addCategory(set
.getAlias(), U_GC_NL_MASK
, status
);
4547 addCategory(set
.getAlias(), U_GC_MC_MASK
, status
);
4548 addCategory(set
.getAlias(), U_GC_MN_MASK
, status
);
4549 addIdentifierIgnorable(set
.getAlias(), status
);
4551 else if (propName
.compare(u
"javaUpperCase", -1) == 0) {
4552 addCategory(set
.getAlias(), U_GC_LU_MASK
, status
);
4554 else if (propName
.compare(u
"javaValidCodePoint", -1) == 0) {
4555 set
->add(0, UnicodeSet::MAX_VALUE
);
4557 else if (propName
.compare(u
"javaWhitespace", -1) == 0) {
4558 addCategory(set
.getAlias(), U_GC_Z_MASK
, status
);
4559 set
->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4560 set
->add(9, 0x0d).add(0x1c, 0x1f);
4562 status
= U_REGEX_PROPERTY_SYNTAX
;
4565 if (U_SUCCESS(status
) && !set
->isEmpty() && (usetFlags
& USET_CASE_INSENSITIVE
)) {
4566 set
->closeOver(USET_CASE_INSENSITIVE
);
4571 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4572 // extensions matched it.
4573 status
= U_REGEX_PROPERTY_SYNTAX
;
4574 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4576 if (U_SUCCESS(status
)) {
4577 U_ASSERT(set
.isValid());
4581 return set
.orphan();
4583 if (status
== U_ILLEGAL_ARGUMENT_ERROR
) {
4584 status
= U_REGEX_PROPERTY_SYNTAX
;
4593 // SetEval Part of the evaluation of [set expressions].
4594 // Perform any pending (stacked) operations with precedence
4595 // equal or greater to that of the next operator encountered
4596 // in the expression.
4598 void RegexCompile::setEval(int32_t nextOp
) {
4599 UnicodeSet
*rightOperand
= NULL
;
4600 UnicodeSet
*leftOperand
= NULL
;
4602 U_ASSERT(fSetOpStack
.empty()==FALSE
);
4603 int32_t pendingSetOperation
= fSetOpStack
.peeki();
4604 if ((pendingSetOperation
&0xffff0000) < (nextOp
&0xffff0000)) {
4608 U_ASSERT(fSetStack
.empty() == FALSE
);
4609 rightOperand
= (UnicodeSet
*)fSetStack
.peek();
4610 switch (pendingSetOperation
) {
4612 rightOperand
->complement();
4615 // TODO: need a simple close function. Ticket 6065
4616 rightOperand
->closeOver(USET_CASE_INSENSITIVE
);
4617 rightOperand
->removeAllStrings();
4619 case setDifference1
:
4620 case setDifference2
:
4622 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4623 leftOperand
->removeAll(*rightOperand
);
4624 delete rightOperand
;
4626 case setIntersection1
:
4627 case setIntersection2
:
4629 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4630 leftOperand
->retainAll(*rightOperand
);
4631 delete rightOperand
;
4635 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4636 leftOperand
->addAll(*rightOperand
);
4637 delete rightOperand
;
4645 void RegexCompile::setPushOp(int32_t op
) {
4647 fSetOpStack
.push(op
, *fStatus
);
4648 fSetStack
.push(new UnicodeSet(), *fStatus
);
4652 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS