1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 // All Rights Reserved.
9 // This file contains the ICU regular expression compiler, which is responsible
10 // for processing a regular expression pattern into the compiled form that
11 // is used by the match finding engine.
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
38 #include "regexcst.h" // Contains state table for the regex pattern parser.
39 // generated by a Perl script.
49 //------------------------------------------------------------------------------
53 //------------------------------------------------------------------------------
54 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) :
55 fParenStack(status
), fSetStack(status
), fSetOpStack(status
)
57 // Lazy init of all shared global sets (needed for init()'s empty text)
58 RegexStaticSets::initGlobals(&status
);
69 fInBackslashQuote
= FALSE
;
70 fModeFlags
= fRXPat
->fFlags
| 0x80000000;
74 fMatchCloseParen
= -1;
76 fLastSetLiteral
= U_SENTINEL
;
78 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
79 status
= rxp
->fDeferredStatus
;
83 static const UChar chAmp
= 0x26; // '&'
84 static const UChar chDash
= 0x2d; // '-'
87 //------------------------------------------------------------------------------
91 //------------------------------------------------------------------------------
92 RegexCompile::~RegexCompile() {
93 delete fCaptureName
; // Normally will be NULL, but can exist if pattern
94 // compilation stops with a syntax error.
97 static inline void addCategory(UnicodeSet
*set
, int32_t value
, UErrorCode
& ec
) {
98 set
->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, value
, ec
));
101 //------------------------------------------------------------------------------
103 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
104 // The state tables are hand-written in the file regexcst.txt,
105 // and converted to the form used here by a perl
106 // script regexcst.pl
108 //------------------------------------------------------------------------------
109 void RegexCompile::compile(
110 const UnicodeString
&pat
, // Source pat to be compiled.
111 UParseError
&pp
, // Error position info
112 UErrorCode
&e
) // Error Code
114 fRXPat
->fPatternString
= new UnicodeString(pat
);
115 UText patternText
= UTEXT_INITIALIZER
;
116 utext_openConstUnicodeString(&patternText
, fRXPat
->fPatternString
, &e
);
119 compile(&patternText
, pp
, e
);
120 utext_close(&patternText
);
125 // compile, UText mode
126 // All the work is actually done here.
128 void RegexCompile::compile(
129 UText
*pat
, // Source pat to be compiled.
130 UParseError
&pp
, // Error position info
131 UErrorCode
&e
) // Error Code
136 fStack
[fStackPtr
] = 0;
138 if (U_FAILURE(*fStatus
)) {
142 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
143 U_ASSERT(fRXPat
->fPattern
== NULL
|| utext_nativeLength(fRXPat
->fPattern
) == 0);
145 // Prepare the RegexPattern object to receive the compiled pattern.
146 fRXPat
->fPattern
= utext_clone(fRXPat
->fPattern
, pat
, FALSE
, TRUE
, fStatus
);
147 if (U_FAILURE(*fStatus
)) {
150 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
151 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
154 // Initialize the pattern scanning state machine
155 fPatternLength
= utext_nativeLength(pat
);
157 const RegexTableEl
*tableEl
;
159 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
160 if (fModeFlags
& UREGEX_LITERAL
) {
164 nextChar(fC
); // Fetch the first char from the pattern string.
167 // Main loop for the regex pattern parsing state machine.
168 // Runs once per state transition.
169 // Each time through optionally performs, depending on the state table,
170 // - an advance to the the next pattern char
171 // - an action to be performed.
172 // - pushing or popping a state to/from the local state return stack.
173 // file regexcst.txt is the source for the state table. The logic behind
174 // recongizing the pattern syntax is there, not here.
177 // Bail out if anything has gone wrong.
178 // Regex pattern parsing stops on the first error encountered.
179 if (U_FAILURE(*fStatus
)) {
183 U_ASSERT(state
!= 0);
185 // Find the state table element that matches the input char from the pattern, or the
186 // class of the input character. Start with the first table row for this
187 // state, then linearly scan forward until we find a row that matches the
188 // character. The last row for each state always matches all characters, so
189 // the search will stop there, if not before.
191 tableEl
= &gRuleParseStateTable
[state
];
192 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
193 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]));
195 for (;;) { // loop through table rows belonging to this state, looking for one
196 // that matches the current input char.
197 REGEX_SCAN_DEBUG_PRINTF(("."));
198 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
199 // Table row specified an individual character, not a set, and
200 // the input character is not quoted, and
201 // the input character matched it.
204 if (tableEl
->fCharClass
== 255) {
205 // Table row specified default, match anything character class.
208 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
209 // Table row specified "quoted" and the char was quoted.
212 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
213 // Table row specified eof and we hit eof on the input.
217 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
218 fC
.fQuoted
== FALSE
&& // char is not escaped &&
219 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
220 U_ASSERT(tableEl
->fCharClass
<= 137);
221 if (RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128].contains(fC
.fChar
)) {
222 // Table row specified a character class, or set of characters,
223 // and the current char matches it.
228 // No match on this row, advance to the next row for this state,
231 REGEX_SCAN_DEBUG_PRINTF(("\n"));
234 // We've found the row of the state table that matches the current input
235 // character from the rules string.
236 // Perform any action specified by this row in the state table.
237 if (doParseActions(tableEl
->fAction
) == FALSE
) {
238 // Break out of the state machine loop if the
239 // the action signalled some kind of error, or
240 // the action was to exit, occurs on normal end-of-rules-input.
244 if (tableEl
->fPushState
!= 0) {
246 if (fStackPtr
>= kStackSize
) {
247 error(U_REGEX_INTERNAL_ERROR
);
248 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
251 fStack
[fStackPtr
] = tableEl
->fPushState
;
255 // NextChar. This is where characters are actually fetched from the pattern.
256 // Happens under control of the 'n' tag in the state table.
258 if (tableEl
->fNextChar
) {
262 // Get the next state from the table entry, or from the
263 // state stack if the next state was specified as "pop".
264 if (tableEl
->fNextState
!= 255) {
265 state
= tableEl
->fNextState
;
267 state
= fStack
[fStackPtr
];
270 // state stack underflow
271 // This will occur if the user pattern has mis-matched parentheses,
272 // with extra close parens.
275 error(U_REGEX_MISMATCHED_PAREN
);
281 if (U_FAILURE(*fStatus
)) {
282 // Bail out if the pattern had errors.
283 // Set stack cleanup: a successful compile would have left it empty,
284 // but errors can leave temporary sets hanging around.
285 while (!fSetStack
.empty()) {
286 delete (UnicodeSet
*)fSetStack
.pop();
292 // The pattern has now been read and processed, and the compiled code generated.
296 // The pattern's fFrameSize so far has accumulated the requirements for
297 // storage for capture parentheses, counters, etc. that are encountered
298 // in the pattern. Add space for the two variables that are always
299 // present in the saved state: the input string position (int64_t) and
300 // the position in the compiled pattern.
302 allocateStackData(RESTACKFRAME_HDRCOUNT
);
305 // Optimization pass 1: NOPs, back-references, and case-folding
310 // Get bounds for the minimum and maximum length of a string that this
311 // pattern can match. Used to avoid looking for matches in strings that
314 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
317 // Optimization pass 2: match start type
322 // Set up fast latin-1 range sets
324 int32_t numSets
= fRXPat
->fSets
->size();
325 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
326 // Null pointer check.
327 if (fRXPat
->fSets8
== NULL
) {
328 e
= *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
332 for (i
=0; i
<numSets
; i
++) {
333 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
334 fRXPat
->fSets8
[i
].init(s
);
343 //------------------------------------------------------------------------------
345 // doParseAction Do some action during regex pattern parsing.
346 // Called by the parse state machine.
348 // Generation of the match engine PCode happens here, or
349 // in functions called from the parse actions defined here.
352 //------------------------------------------------------------------------------
353 UBool
RegexCompile::doParseActions(int32_t action
)
355 UBool returnVal
= TRUE
;
357 switch ((Regex_PatternParseAction
)action
) {
360 // Start of pattern compiles to:
361 //0 SAVE 2 Fall back to position of FAIL
363 //2 FAIL Stop if we ever reach here.
364 //3 NOP Dummy, so start of pattern looks the same as
365 // the start of an ( grouping.
366 //4 NOP Resreved, will be replaced by a save if there are
367 // OR | operators at the top level
368 appendOp(URX_STATE_SAVE
, 2);
369 appendOp(URX_JMP
, 3);
370 appendOp(URX_FAIL
, 0);
372 // Standard open nonCapture paren action emits the two NOPs and
373 // sets up the paren stack frame.
374 doParseActions(doOpenNonCaptureParen
);
378 // We've scanned to the end of the pattern
379 // The end of pattern compiles to:
381 // which will stop the runtime match engine.
382 // Encountering end of pattern also behaves like a close paren,
383 // and forces fixups of the State Save at the beginning of the compiled pattern
384 // and of any OR operations at the top level.
387 if (fParenStack
.size() > 0) {
388 // Missing close paren in pattern.
389 error(U_REGEX_MISMATCHED_PAREN
);
392 // add the END operation to the compiled pattern.
393 appendOp(URX_END
, 0);
395 // Terminate the pattern compilation state machine.
402 // Scanning a '|', as in (A|B)
404 // Generate code for any pending literals preceding the '|'
407 // Insert a SAVE operation at the start of the pattern section preceding
408 // this OR at this level. This SAVE will branch the match forward
409 // to the right hand side of the OR in the event that the left hand
410 // side fails to match and backtracks. Locate the position for the
411 // save from the location on the top of the parentheses stack.
412 int32_t savePosition
= fParenStack
.popi();
413 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(savePosition
);
414 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
415 op
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
416 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
418 // Append an JMP operation into the compiled pattern. The operand for
419 // the JMP will eventually be the location following the ')' for the
420 // group. This will be patched in later, when the ')' is encountered.
421 appendOp(URX_JMP
, 0);
423 // Push the position of the newly added JMP op onto the parentheses stack.
424 // This registers if for fixup when this block's close paren is encountered.
425 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
427 // Append a NOP to the compiled pattern. This is the slot reserved
428 // for a SAVE in the event that there is yet another '|' following
430 appendOp(URX_NOP
, 0);
431 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
436 case doBeginNamedCapture
:
437 // Scanning (?<letter.
438 // The first letter of the name will come through again under doConinueNamedCapture.
439 fCaptureName
= new UnicodeString();
440 if (fCaptureName
== NULL
) {
441 error(U_MEMORY_ALLOCATION_ERROR
);
445 case doContinueNamedCapture
:
446 fCaptureName
->append(fC
.fChar
);
449 case doBadNamedCapture
:
450 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
453 case doOpenCaptureParen
:
454 // Open Capturing Paren, possibly named.
456 // - NOP, which later may be replaced by a save-state if the
457 // parenthesized group gets a * quantifier, followed by
458 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
459 // - NOP, which may later be replaced by a save-state if there
460 // is an '|' alternation within the parens.
462 // Each capture group gets three slots in the save stack frame:
463 // 0: Capture Group start position (in input string being matched.)
464 // 1: Capture Group end position.
465 // 2: Start of Match-in-progress.
466 // The first two locations are for a completed capture group, and are
467 // referred to by back references and the like.
468 // The third location stores the capture start position when an START_CAPTURE is
469 // encountered. This will be promoted to a completed capture when (and if) the corresponding
470 // END_CAPTURE is encountered.
473 appendOp(URX_NOP
, 0);
474 int32_t varsLoc
= allocateStackData(3); // Reserve three slots in match stack frame.
475 appendOp(URX_START_CAPTURE
, varsLoc
);
476 appendOp(URX_NOP
, 0);
478 // On the Parentheses stack, start a new frame and add the postions
479 // of the two NOPs. Depending on what follows in the pattern, the
480 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
481 // address of the end of the parenthesized group.
482 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
483 fParenStack
.push(capturing
, *fStatus
); // Frame type.
484 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
485 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
487 // Save the mapping from group number to stack frame variable position.
488 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
490 // If this is a named capture group, add the name->group number mapping.
491 if (fCaptureName
!= NULL
) {
492 int32_t groupNumber
= fRXPat
->fGroupMap
->size();
493 int32_t previousMapping
= uhash_puti(fRXPat
->fNamedCaptureMap
, fCaptureName
, groupNumber
, fStatus
);
494 fCaptureName
= NULL
; // hash table takes ownership of the name (key) string.
495 if (previousMapping
> 0 && U_SUCCESS(*fStatus
)) {
496 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
502 case doOpenNonCaptureParen
:
503 // Open non-caputuring (grouping only) Paren.
505 // - NOP, which later may be replaced by a save-state if the
506 // parenthesized group gets a * quantifier, followed by
507 // - NOP, which may later be replaced by a save-state if there
508 // is an '|' alternation within the parens.
511 appendOp(URX_NOP
, 0);
512 appendOp(URX_NOP
, 0);
514 // On the Parentheses stack, start a new frame and add the postions
516 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
517 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
518 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
519 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
524 case doOpenAtomicParen
:
525 // Open Atomic Paren. (?>
527 // - NOP, which later may be replaced if the parenthesized group
528 // has a quantifier, followed by
529 // - STO_SP save state stack position, so it can be restored at the ")"
530 // - NOP, which may later be replaced by a save-state if there
531 // is an '|' alternation within the parens.
534 appendOp(URX_NOP
, 0);
535 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the state stack ptr.
536 appendOp(URX_STO_SP
, varLoc
);
537 appendOp(URX_NOP
, 0);
539 // On the Parentheses stack, start a new frame and add the postions
540 // of the two NOPs. Depending on what follows in the pattern, the
541 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
542 // address of the end of the parenthesized group.
543 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
544 fParenStack
.push(atomic
, *fStatus
); // Frame type.
545 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
546 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
551 case doOpenLookAhead
:
552 // Positive Look-ahead (?= stuff )
554 // Note: Addition of transparent input regions, with the need to
555 // restore the original regions when failing out of a lookahead
556 // block, complicated this sequence. Some conbined opcodes
557 // might make sense - or might not, lookahead aren't that common.
559 // Caution: min match length optimization knows about this
560 // sequence; don't change without making updates there too.
563 // 1 START_LA dataLoc Saves SP, Input Pos
564 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
565 // 3 JMP 6 continue ...
567 // 4. LA_END Look Ahead failed. Restore regions.
568 // 5. BACKTRACK and back track again.
570 // 6. NOP reserved for use by quantifiers on the block.
571 // Look-ahead can't have quantifiers, but paren stack
572 // compile time conventions require the slot anyhow.
573 // 7. NOP may be replaced if there is are '|' ops in the block.
574 // 8. code for parenthesized stuff.
577 // Two data slots are reserved, for saving the stack ptr and the input position.
580 int32_t dataLoc
= allocateData(2);
581 appendOp(URX_LA_START
, dataLoc
);
582 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+ 2);
583 appendOp(URX_JMP
, fRXPat
->fCompiledPat
->size()+ 3);
584 appendOp(URX_LA_END
, dataLoc
);
585 appendOp(URX_BACKTRACK
, 0);
586 appendOp(URX_NOP
, 0);
587 appendOp(URX_NOP
, 0);
589 // On the Parentheses stack, start a new frame and add the postions
591 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
592 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
593 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
594 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
598 case doOpenLookAheadNeg
:
599 // Negated Lookahead. (?! stuff )
601 // 1. START_LA dataloc
602 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
603 // // which continues with the match.
604 // 3. NOP // Std. Open Paren sequence, for possible '|'
605 // 4. code for parenthesized stuff.
606 // 5. END_LA // Cut back stack, remove saved state from step 2.
607 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
608 // 7. END_LA // Restore match region, in case look-ahead was using
609 // an alternate (transparent) region.
612 int32_t dataLoc
= allocateData(2);
613 appendOp(URX_LA_START
, dataLoc
);
614 appendOp(URX_STATE_SAVE
, 0); // dest address will be patched later.
615 appendOp(URX_NOP
, 0);
617 // On the Parentheses stack, start a new frame and add the postions
618 // of the StateSave and NOP.
619 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
620 fParenStack
.push(negLookAhead
, *fStatus
); // Frame type
621 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
622 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
624 // Instructions #5 - #7 will be added when the ')' is encountered.
628 case doOpenLookBehind
:
630 // Compile a (?<= look-behind open paren.
633 // 0 URX_LB_START dataLoc
634 // 1 URX_LB_CONT dataLoc
637 // 4 URX_NOP Standard '(' boilerplate.
638 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
639 // 6 <code for LookBehind expression>
640 // 7 URX_LB_END dataLoc # Check match len, restore input len
641 // 8 URX_LA_END dataLoc # Restore stack, input pos
643 // Allocate a block of matcher data, to contain (when running a match)
644 // 0: Stack ptr on entry
645 // 1: Input Index on entry
646 // 2: Start index of match current match attempt.
647 // 3: Original Input String len.
649 // Generate match code for any pending literals.
652 // Allocate data space
653 int32_t dataLoc
= allocateData(4);
656 appendOp(URX_LB_START
, dataLoc
);
659 appendOp(URX_LB_CONT
, dataLoc
);
660 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
661 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
664 appendOp(URX_NOP
, 0);
665 appendOp(URX_NOP
, 0);
667 // On the Parentheses stack, start a new frame and add the postions
668 // of the URX_LB_CONT and the NOP.
669 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
670 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
671 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
672 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
674 // The final two instructions will be added when the ')' is encountered.
679 case doOpenLookBehindNeg
:
681 // Compile a (?<! negated look-behind open paren.
684 // 0 URX_LB_START dataLoc # Save entry stack, input len
685 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
689 // 5 URX_NOP Standard '(' boilerplate.
690 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
691 // 7 <code for LookBehind expression>
692 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
695 // Allocate a block of matcher data, to contain (when running a match)
696 // 0: Stack ptr on entry
697 // 1: Input Index on entry
698 // 2: Start index of match current match attempt.
699 // 3: Original Input String len.
701 // Generate match code for any pending literals.
704 // Allocate data space
705 int32_t dataLoc
= allocateData(4);
708 appendOp(URX_LB_START
, dataLoc
);
711 appendOp(URX_LBN_CONT
, dataLoc
);
712 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
713 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
714 appendOp(URX_RESERVED_OP
, 0); // Continue Loc. To be filled later.
717 appendOp(URX_NOP
, 0);
718 appendOp(URX_NOP
, 0);
720 // On the Parentheses stack, start a new frame and add the postions
721 // of the URX_LB_CONT and the NOP.
722 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
723 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
724 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
725 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
727 // The final two instructions will be added when the ')' is encountered.
731 case doConditionalExpr
:
732 // Conditionals such as (?(1)a:b)
734 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
735 error(U_REGEX_UNIMPLEMENTED
);
741 if (fParenStack
.size() <= 0) {
742 // Extra close paren, or missing open paren.
743 error(U_REGEX_MISMATCHED_PAREN
);
751 case doBadOpenParenType
:
753 error(U_REGEX_RULE_SYNTAX
);
757 case doMismatchedParenErr
:
758 error(U_REGEX_MISMATCHED_PAREN
);
762 // Normal '+' compiles to
763 // 1. stuff to be repeated (already built)
767 // Or, if the item to be repeated can match a zero length string,
768 // 1. STO_INP_LOC data-loc
769 // 2. body of stuff to be repeated
774 // Or, if the item to be repeated is simple
775 // 1. Item to be repeated.
776 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
777 // 3. LOOP_C stack location
779 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
782 // Check for simple constructs, which may get special optimized code.
783 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
784 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
786 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
787 // Emit optimized code for [char set]+
788 appendOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
789 frameLoc
= allocateStackData(1);
790 appendOp(URX_LOOP_C
, frameLoc
);
794 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
795 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
796 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
797 // Emit Optimized code for .+ operations.
798 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
799 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
800 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
803 if (fModeFlags
& UREGEX_UNIX_LINES
) {
807 frameLoc
= allocateStackData(1);
808 appendOp(URX_LOOP_C
, frameLoc
);
816 // Check for minimum match length of zero, which requires
817 // extra loop-breaking code.
818 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
819 // Zero length match is possible.
820 // Emit the code sequence that can handle it.
822 frameLoc
= allocateStackData(1);
824 int32_t op
= buildOp(URX_STO_INP_LOC
, frameLoc
);
825 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
827 appendOp(URX_JMP_SAV_X
, topLoc
+1);
829 // Simpler code when the repeated body must match something non-empty
830 appendOp(URX_JMP_SAV
, topLoc
);
836 // Non-greedy '+?' compiles to
837 // 1. stuff to be repeated (already built)
841 int32_t topLoc
= blockTopLoc(FALSE
);
842 appendOp(URX_STATE_SAVE
, topLoc
);
848 // Normal (greedy) ? quantifier.
851 // 2. body of optional block
853 // Insert the state save into the compiled pattern, and we're done.
855 int32_t saveStateLoc
= blockTopLoc(TRUE
);
856 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
857 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
862 // Non-greedy ?? quantifier
865 // 2. body of optional block
869 // This code is less than ideal, with two jmps instead of one, because we can only
870 // insert one instruction at the top of the block being iterated.
872 int32_t jmp1_loc
= blockTopLoc(TRUE
);
873 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
875 int32_t jmp1_op
= buildOp(URX_JMP
, jmp2_loc
+1);
876 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
878 appendOp(URX_JMP
, jmp2_loc
+2);
880 appendOp(URX_STATE_SAVE
, jmp1_loc
+1);
886 // Normal (greedy) * quantifier.
889 // 2. body of stuff being iterated over
893 // Or, if the body is a simple [Set],
894 // 1. LOOP_SR_I set number
895 // 2. LOOP_C stack location
898 // Or if this is a .*
899 // 1. LOOP_DOT_I (. matches all mode flag)
900 // 2. LOOP_C stack location
902 // Or, if the body can match a zero-length string, to inhibit infinite loops,
904 // 2. STO_INP_LOC data-loc
909 // location of item #1, the STATE_SAVE
910 int32_t topLoc
= blockTopLoc(FALSE
);
911 int32_t dataLoc
= -1;
913 // Check for simple *, where the construct being repeated
914 // compiled to single opcode, and might be optimizable.
915 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
916 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
918 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
919 // Emit optimized code for a [char set]*
920 int32_t loopOpI
= buildOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
921 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
922 dataLoc
= allocateStackData(1);
923 appendOp(URX_LOOP_C
, dataLoc
);
927 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
928 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
929 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
930 // Emit Optimized code for .* operations.
931 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
932 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
933 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
936 if ((fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
939 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
940 dataLoc
= allocateStackData(1);
941 appendOp(URX_LOOP_C
, dataLoc
);
946 // Emit general case code for this *
947 // The optimizations did not apply.
949 int32_t saveStateLoc
= blockTopLoc(TRUE
);
950 int32_t jmpOp
= buildOp(URX_JMP_SAV
, saveStateLoc
+1);
952 // Check for minimum match length of zero, which requires
953 // extra loop-breaking code.
954 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
955 insertOp(saveStateLoc
);
956 dataLoc
= allocateStackData(1);
958 int32_t op
= buildOp(URX_STO_INP_LOC
, dataLoc
);
959 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
960 jmpOp
= buildOp(URX_JMP_SAV_X
, saveStateLoc
+2);
963 // Locate the position in the compiled pattern where the match will continue
964 // after completing the *. (4 or 5 in the comment above)
965 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
967 // Put together the save state op and store it into the compiled code.
968 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, continueLoc
);
969 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
971 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
977 // Non-greedy *? quantifier
980 // 2. body of stuff being iterated over
984 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
985 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
986 int32_t jmpOp
= buildOp(URX_JMP
, saveLoc
);
987 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
988 appendOp(URX_STATE_SAVE
, jmpLoc
+1);
994 // The '{' opening an interval quantifier was just scanned.
995 // Init the counter varaiables that will accumulate the values as the digits
1001 case doIntevalLowerDigit
:
1002 // Scanned a digit from the lower value of an {lower,upper} interval
1004 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1005 U_ASSERT(digitValue
>= 0);
1006 int64_t val
= (int64_t)fIntervalLow
*10 + digitValue
;
1007 if (val
> INT32_MAX
) {
1008 error(U_REGEX_NUMBER_TOO_BIG
);
1010 fIntervalLow
= (int32_t)val
;
1015 case doIntervalUpperDigit
:
1016 // Scanned a digit from the upper value of an {lower,upper} interval
1018 if (fIntervalUpper
< 0) {
1021 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1022 U_ASSERT(digitValue
>= 0);
1023 int64_t val
= (int64_t)fIntervalUpper
*10 + digitValue
;
1024 if (val
> INT32_MAX
) {
1025 error(U_REGEX_NUMBER_TOO_BIG
);
1027 fIntervalUpper
= (int32_t)val
;
1032 case doIntervalSame
:
1033 // Scanned a single value interval like {27}. Upper = Lower.
1034 fIntervalUpper
= fIntervalLow
;
1038 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1039 if (compileInlineInterval() == FALSE
) {
1040 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1044 case doPossessiveInterval
:
1045 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1047 // Remember the loc for the top of the block being looped over.
1048 // (Can not reserve a slot in the compiled pattern at this time, because
1049 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1051 int32_t topLoc
= blockTopLoc(FALSE
);
1053 // Produce normal looping code.
1054 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1056 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1057 // just as if the loop was inclosed in atomic parentheses.
1059 // First the STO_SP before the start of the loop
1062 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the
1063 int32_t op
= buildOp(URX_STO_SP
, varLoc
);
1064 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1066 int32_t loopOp
= (int32_t)fRXPat
->fCompiledPat
->popi();
1067 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1068 loopOp
++; // point LoopOp after the just-inserted STO_SP
1069 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1071 // Then the LD_SP after the end of the loop
1072 appendOp(URX_LD_SP
, varLoc
);
1078 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1079 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1082 case doIntervalError
:
1083 error(U_REGEX_BAD_INTERVAL
);
1087 // We've just scanned a "normal" character from the pattern,
1088 literalChar(fC
.fChar
);
1092 case doEscapedLiteralChar
:
1093 // We've just scanned an backslashed escaped character with no
1094 // special meaning. It represents itself.
1095 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1096 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1097 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1098 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1100 literalChar(fC
.fChar
);
1105 // scanned a ".", match any single character.
1108 if (fModeFlags
& UREGEX_DOTALL
) {
1109 appendOp(URX_DOTANY_ALL
, 0);
1110 } else if (fModeFlags
& UREGEX_UNIX_LINES
) {
1111 appendOp(URX_DOTANY_UNIX
, 0);
1113 appendOp(URX_DOTANY
, 0);
1121 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1122 appendOp(URX_CARET
, 0);
1123 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1124 appendOp(URX_CARET_M
, 0);
1125 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1126 appendOp(URX_CARET
, 0); // Only testing true start of input.
1127 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1128 appendOp(URX_CARET_M_UNIX
, 0);
1136 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1137 appendOp(URX_DOLLAR
, 0);
1138 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1139 appendOp(URX_DOLLAR_M
, 0);
1140 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1141 appendOp(URX_DOLLAR_D
, 0);
1142 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1143 appendOp(URX_DOLLAR_MD
, 0);
1150 appendOp(URX_CARET
, 0);
1155 #if UCONFIG_NO_BREAK_ITERATION==1
1156 if (fModeFlags
& UREGEX_UWORD
) {
1157 error(U_UNSUPPORTED_ERROR
);
1161 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1168 #if UCONFIG_NO_BREAK_ITERATION==1
1169 if (fModeFlags
& UREGEX_UWORD
) {
1170 error(U_UNSUPPORTED_ERROR
);
1174 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1181 appendOp(URX_BACKSLASH_D
, 1);
1186 appendOp(URX_BACKSLASH_D
, 0);
1191 appendOp(URX_BACKSLASH_G
, 0);
1196 appendOp(URX_BACKSLASH_H
, 1);
1201 appendOp(URX_BACKSLASH_H
, 0);
1206 appendOp(URX_BACKSLASH_R
, 0);
1211 appendOp(URX_STAT_SETREF_N
, URX_ISSPACE_SET
);
1216 appendOp(URX_STATIC_SETREF
, URX_ISSPACE_SET
);
1221 appendOp(URX_BACKSLASH_V
, 1);
1226 appendOp(URX_BACKSLASH_V
, 0);
1231 appendOp(URX_STAT_SETREF_N
, URX_ISWORD_SET
);
1236 appendOp(URX_STATIC_SETREF
, URX_ISWORD_SET
);
1241 appendOp(URX_BACKSLASH_X
, 0);
1247 appendOp(URX_DOLLAR
, 0);
1252 appendOp(URX_BACKSLASH_Z
, 0);
1256 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1267 UnicodeSet
*theSet
= scanProp();
1274 UChar32 c
= scanNamedChar();
1281 // BackReference. Somewhat unusual in that the front-end can not completely parse
1282 // the regular expression, because the number of digits to be consumed
1283 // depends on the number of capture groups that have been defined. So
1284 // we have to do it here instead.
1286 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1287 int32_t groupNum
= 0;
1288 UChar32 c
= fC
.fChar
;
1291 // Loop once per digit, for max allowed number of digits in a back reference.
1292 int32_t digit
= u_charDigitValue(c
);
1293 groupNum
= groupNum
* 10 + digit
;
1294 if (groupNum
>= numCaptureGroups
) {
1298 if (RegexStaticSets::gStaticSets
->fRuleDigitsAlias
->contains(c
) == FALSE
) {
1304 // Scan of the back reference in the source regexp is complete. Now generate
1305 // the compiled code for it.
1306 // Because capture groups can be forward-referenced by back-references,
1307 // we fill the operand with the capture group number. At the end
1308 // of compilation, it will be changed to the variable's location.
1309 U_ASSERT(groupNum
> 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1310 // and shouldn't enter this code path at all.
1312 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1313 appendOp(URX_BACKREF_I
, groupNum
);
1315 appendOp(URX_BACKREF
, groupNum
);
1320 case doBeginNamedBackRef
:
1321 U_ASSERT(fCaptureName
== NULL
);
1322 fCaptureName
= new UnicodeString
;
1323 if (fCaptureName
== NULL
) {
1324 error(U_MEMORY_ALLOCATION_ERROR
);
1328 case doContinueNamedBackRef
:
1329 fCaptureName
->append(fC
.fChar
);
1332 case doCompleteNamedBackRef
:
1334 int32_t groupNumber
= uhash_geti(fRXPat
->fNamedCaptureMap
, fCaptureName
);
1335 if (groupNumber
== 0) {
1336 // Group name has not been defined.
1337 // Could be a forward reference. If we choose to support them at some
1338 // future time, extra mechanism will be required at this point.
1339 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
1341 // Given the number, handle identically to a \n numbered back reference.
1342 // See comments above, under doBackRef
1344 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1345 appendOp(URX_BACKREF_I
, groupNumber
);
1347 appendOp(URX_BACKREF
, groupNumber
);
1350 delete fCaptureName
;
1351 fCaptureName
= NULL
;
1355 case doPossessivePlus
:
1356 // Possessive ++ quantifier.
1359 // 2. body of stuff being iterated over
1365 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1366 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1370 int32_t topLoc
= blockTopLoc(TRUE
);
1371 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1372 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1373 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1375 // Emit the STATE_SAVE
1376 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1379 appendOp(URX_JMP
, topLoc
+1);
1382 appendOp(URX_LD_SP
, stoLoc
);
1386 case doPossessiveStar
:
1387 // Possessive *+ quantifier.
1391 // 3. body of stuff being iterated over
1395 // TODO: do something to cut back the state stack each time through the loop.
1397 // Reserve two slots at the top of the block.
1398 int32_t topLoc
= blockTopLoc(TRUE
);
1402 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1403 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1404 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1406 // Emit the SAVE_STATE 5
1407 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1408 op
= buildOp(URX_STATE_SAVE
, L7
);
1409 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1411 // Append the JMP operation.
1412 appendOp(URX_JMP
, topLoc
+1);
1414 // Emit the LD_SP loc
1415 appendOp(URX_LD_SP
, stoLoc
);
1419 case doPossessiveOpt
:
1420 // Possessive ?+ quantifier.
1424 // 3. body of optional block
1429 // Reserve two slots at the top of the block.
1430 int32_t topLoc
= blockTopLoc(TRUE
);
1434 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1435 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1436 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1438 // Emit the SAVE_STATE
1439 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1440 op
= buildOp(URX_STATE_SAVE
, continueLoc
);
1441 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1444 appendOp(URX_LD_SP
, stoLoc
);
1449 case doBeginMatchMode
:
1450 fNewModeFlags
= fModeFlags
;
1451 fSetModeFlag
= TRUE
;
1454 case doMatchMode
: // (?i) and similar
1458 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1459 case 0x64: /* 'd' */ bit
= UREGEX_UNIX_LINES
; break;
1460 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1461 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1462 case 0x75: /* 'u' */ bit
= 0; /* Unicode casing */ break;
1463 case 0x77: /* 'w' */ bit
= UREGEX_UWORD
; break;
1464 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1465 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1467 U_ASSERT(FALSE
); // Should never happen. Other chars are filtered out
1471 fNewModeFlags
|= bit
;
1473 fNewModeFlags
&= ~bit
;
1478 case doSetMatchMode
:
1479 // Emit code to match any pending literals, using the not-yet changed match mode.
1482 // We've got a (?i) or similar. The match mode is being changed, but
1483 // the change is not scoped to a parenthesized block.
1484 U_ASSERT(fNewModeFlags
< 0);
1485 fModeFlags
= fNewModeFlags
;
1490 case doMatchModeParen
:
1491 // We've got a (?i: or similar. Begin a parenthesized block, save old
1492 // mode flags so they can be restored at the close of the block.
1495 // - NOP, which later may be replaced by a save-state if the
1496 // parenthesized group gets a * quantifier, followed by
1497 // - NOP, which may later be replaced by a save-state if there
1498 // is an '|' alternation within the parens.
1501 appendOp(URX_NOP
, 0);
1502 appendOp(URX_NOP
, 0);
1504 // On the Parentheses stack, start a new frame and add the postions
1505 // of the two NOPs (a normal non-capturing () frame, except for the
1506 // saving of the orignal mode flags.)
1507 fParenStack
.push(fModeFlags
, *fStatus
);
1508 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1509 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1510 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1512 // Set the current mode flags to the new values.
1513 U_ASSERT(fNewModeFlags
< 0);
1514 fModeFlags
= fNewModeFlags
;
1519 error(U_REGEX_INVALID_FLAG
);
1522 case doSuppressComments
:
1523 // We have just scanned a '(?'. We now need to prevent the character scanner from
1524 // treating a '#' as a to-the-end-of-line comment.
1525 // (This Perl compatibility just gets uglier and uglier to do...)
1526 fEOLComments
= FALSE
;
1532 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1539 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1544 case doSetBackslash_s
:
1546 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1547 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1551 case doSetBackslash_S
:
1553 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1554 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1560 case doSetBackslash_d
:
1562 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1563 // TODO - make a static set, ticket 6058.
1564 addCategory(set
, U_GC_ND_MASK
, *fStatus
);
1568 case doSetBackslash_D
:
1570 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1572 // TODO - make a static set, ticket 6058.
1573 digits
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
1574 digits
.complement();
1575 set
->addAll(digits
);
1579 case doSetBackslash_h
:
1581 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1583 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1584 h
.add((UChar32
)9); // Tab
1589 case doSetBackslash_H
:
1591 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1593 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1594 h
.add((UChar32
)9); // Tab
1600 case doSetBackslash_v
:
1602 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1603 set
->add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1604 set
->add((UChar32
)0x85);
1605 set
->add((UChar32
)0x2028, (UChar32
)0x2029);
1609 case doSetBackslash_V
:
1611 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1613 v
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1614 v
.add((UChar32
)0x85);
1615 v
.add((UChar32
)0x2028, (UChar32
)0x2029);
1621 case doSetBackslash_w
:
1623 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1624 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1628 case doSetBackslash_W
:
1630 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1631 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1639 fSetStack
.push(new UnicodeSet(), *fStatus
);
1640 fSetOpStack
.push(setStart
, *fStatus
);
1641 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1642 fSetOpStack
.push(setCaseClose
, *fStatus
);
1646 case doSetBeginDifference1
:
1647 // We have scanned something like [[abc]-[
1648 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1649 // Push a Difference operator, which will cause the new set to be subtracted from what
1650 // went before once it is created.
1651 setPushOp(setDifference1
);
1652 fSetOpStack
.push(setStart
, *fStatus
);
1653 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1654 fSetOpStack
.push(setCaseClose
, *fStatus
);
1658 case doSetBeginIntersection1
:
1659 // We have scanned something like [[abc]&[
1660 // Need both the '&' operator and the open '[' operator.
1661 setPushOp(setIntersection1
);
1662 fSetOpStack
.push(setStart
, *fStatus
);
1663 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1664 fSetOpStack
.push(setCaseClose
, *fStatus
);
1668 case doSetBeginUnion
:
1669 // We have scanned something like [[abc][
1670 // Need to handle the union operation explicitly [[abc] | [
1671 setPushOp(setUnion
);
1672 fSetOpStack
.push(setStart
, *fStatus
);
1673 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1674 fSetOpStack
.push(setCaseClose
, *fStatus
);
1678 case doSetDifference2
:
1679 // We have scanned something like [abc--
1680 // Consider this to unambiguously be a set difference operator.
1681 setPushOp(setDifference2
);
1685 // Have encountered the ']' that closes a set.
1686 // Force the evaluation of any pending operations within this set,
1687 // leave the completed set on the top of the set stack.
1689 U_ASSERT(fSetOpStack
.peeki()==setStart
);
1695 // Finished a complete set expression, including all nested sets.
1696 // The close bracket has already triggered clearing out pending set operators,
1697 // the operator stack should be empty and the operand stack should have just
1698 // one entry, the result set.
1699 U_ASSERT(fSetOpStack
.empty());
1700 UnicodeSet
*theSet
= (UnicodeSet
*)fSetStack
.pop();
1701 U_ASSERT(fSetStack
.empty());
1706 case doSetIntersection2
:
1707 // Have scanned something like [abc&&
1708 setPushOp(setIntersection2
);
1712 // Union the just-scanned literal character into the set being built.
1713 // This operation is the highest precedence set operation, so we can always do
1714 // it immediately, without waiting to see what follows. It is necessary to perform
1715 // any pending '-' or '&' operation first, because these have the same precedence
1716 // as union-ing in a literal'
1719 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1721 fLastSetLiteral
= fC
.fChar
;
1725 case doSetLiteralEscaped
:
1726 // A back-slash escaped literal character was encountered.
1727 // Processing is the same as with setLiteral, above, with the addition of
1728 // the optional check for errors on escaped ASCII letters.
1730 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1731 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1732 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1733 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1736 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1738 fLastSetLiteral
= fC
.fChar
;
1742 case doSetNamedChar
:
1743 // Scanning a \N{UNICODE CHARACTER NAME}
1744 // Aside from the source of the character, the processing is identical to doSetLiteral,
1747 UChar32 c
= scanNamedChar();
1749 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1751 fLastSetLiteral
= c
;
1755 case doSetNamedRange
:
1756 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1757 // The left character is already in the set, and is saved in fLastSetLiteral.
1758 // The right side needs to be picked up, the scan is at the 'N'.
1759 // Lower Limit > Upper limit being an error matches both Java
1760 // and ICU UnicodeSet behavior.
1762 UChar32 c
= scanNamedChar();
1763 if (U_SUCCESS(*fStatus
) && (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> c
)) {
1764 error(U_REGEX_INVALID_RANGE
);
1766 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1767 s
->add(fLastSetLiteral
, c
);
1768 fLastSetLiteral
= c
;
1774 // Scanned a '^' at the start of a set.
1775 // Push the negation operator onto the set op stack.
1776 // A twist for case-insensitive matching:
1777 // the case closure operation must happen _before_ negation.
1778 // But the case closure operation will already be on the stack if it's required.
1779 // This requires checking for case closure, and swapping the stack order
1780 // if it is present.
1782 int32_t tosOp
= fSetOpStack
.peeki();
1783 if (tosOp
== setCaseClose
) {
1785 fSetOpStack
.push(setNegation
, *fStatus
);
1786 fSetOpStack
.push(setCaseClose
, *fStatus
);
1788 fSetOpStack
.push(setNegation
, *fStatus
);
1793 case doSetNoCloseError
:
1794 error(U_REGEX_MISSING_CLOSE_BRACKET
);
1798 error(U_REGEX_RULE_SYNTAX
); // -- or && at the end of a set. Illegal.
1801 case doSetPosixProp
:
1803 UnicodeSet
*s
= scanPosixProp();
1805 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1808 } // else error. scanProp() reported the error status already.
1813 // Scanned a \p \P within [brackets].
1815 UnicodeSet
*s
= scanProp();
1817 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1820 } // else error. scanProp() reported the error status already.
1826 // We have scanned literal-literal. Add the range to the set.
1827 // The left character is already in the set, and is saved in fLastSetLiteral.
1828 // The right side is the current character.
1829 // Lower Limit > Upper limit being an error matches both Java
1830 // and ICU UnicodeSet behavior.
1833 if (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> fC
.fChar
) {
1834 error(U_REGEX_INVALID_RANGE
);
1836 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1837 s
->add(fLastSetLiteral
, fC
.fChar
);
1843 error(U_REGEX_INTERNAL_ERROR
);
1847 if (U_FAILURE(*fStatus
)) {
1856 //------------------------------------------------------------------------------
1858 // literalChar We've encountered a literal character from the pattern,
1859 // or an escape sequence that reduces to a character.
1860 // Add it to the string containing all literal chars/strings from
1863 //------------------------------------------------------------------------------
1864 void RegexCompile::literalChar(UChar32 c
) {
1865 fLiteralChars
.append(c
);
1869 //------------------------------------------------------------------------------
1871 // fixLiterals When compiling something that can follow a literal
1872 // string in a pattern, emit the code to match the
1873 // accumulated literal string.
1875 // Optionally, split the last char of the string off into
1876 // a single "ONE_CHAR" operation, so that quantifiers can
1877 // apply to that char alone. Example: abc*
1878 // The * must apply to the 'c' only.
1880 //------------------------------------------------------------------------------
1881 void RegexCompile::fixLiterals(UBool split
) {
1883 // If no literal characters have been scanned but not yet had code generated
1884 // for them, nothing needs to be done.
1885 if (fLiteralChars
.length() == 0) {
1889 int32_t indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1890 UChar32 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1892 // Split: We need to ensure that the last item in the compiled pattern
1893 // refers only to the last literal scanned in the pattern, so that
1894 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1895 // Split before case folding for case insensitive matches.
1898 fLiteralChars
.truncate(indexOfLastCodePoint
);
1899 fixLiterals(FALSE
); // Recursive call, emit code to match the first part of the string.
1900 // Note that the truncated literal string may be empty, in which case
1901 // nothing will be emitted.
1903 literalChar(lastCodePoint
); // Re-add the last code point as if it were a new literal.
1904 fixLiterals(FALSE
); // Second recursive call, code for the final code point.
1908 // If we are doing case-insensitive matching, case fold the string. This may expand
1909 // the string, e.g. the German sharp-s turns into "ss"
1910 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1911 fLiteralChars
.foldCase();
1912 indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1913 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1916 if (indexOfLastCodePoint
== 0) {
1917 // Single character, emit a URX_ONECHAR op to match it.
1918 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1919 u_hasBinaryProperty(lastCodePoint
, UCHAR_CASE_SENSITIVE
)) {
1920 appendOp(URX_ONECHAR_I
, lastCodePoint
);
1922 appendOp(URX_ONECHAR
, lastCodePoint
);
1925 // Two or more chars, emit a URX_STRING to match them.
1926 if (fLiteralChars
.length() > 0x00ffffff || fRXPat
->fLiteralText
.length() > 0x00ffffff) {
1927 error(U_REGEX_PATTERN_TOO_BIG
);
1929 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1930 appendOp(URX_STRING_I
, fRXPat
->fLiteralText
.length());
1932 // TODO here: add optimization to split case sensitive strings of length two
1933 // into two single char ops, for efficiency.
1934 appendOp(URX_STRING
, fRXPat
->fLiteralText
.length());
1936 appendOp(URX_STRING_LEN
, fLiteralChars
.length());
1938 // Add this string into the accumulated strings of the compiled pattern.
1939 fRXPat
->fLiteralText
.append(fLiteralChars
);
1942 fLiteralChars
.remove();
1946 int32_t RegexCompile::buildOp(int32_t type
, int32_t val
) {
1947 if (U_FAILURE(*fStatus
)) {
1950 if (type
< 0 || type
> 255) {
1952 error(U_REGEX_INTERNAL_ERROR
);
1953 type
= URX_RESERVED_OP
;
1955 if (val
> 0x00ffffff) {
1957 error(U_REGEX_INTERNAL_ERROR
);
1961 if (!(type
== URX_RESERVED_OP_N
|| type
== URX_RESERVED_OP
)) {
1963 error(U_REGEX_INTERNAL_ERROR
);
1966 if (URX_TYPE(val
) != 0xff) {
1968 error(U_REGEX_INTERNAL_ERROR
);
1971 type
= URX_RESERVED_OP_N
;
1973 return (type
<< 24) | val
;
1977 //------------------------------------------------------------------------------
1979 // appendOp() Append a new instruction onto the compiled pattern
1980 // Includes error checking, limiting the size of the
1981 // pattern to lengths that can be represented in the
1982 // 24 bit operand field of an instruction.
1984 //------------------------------------------------------------------------------
1985 void RegexCompile::appendOp(int32_t op
) {
1986 if (U_FAILURE(*fStatus
)) {
1989 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1990 if ((fRXPat
->fCompiledPat
->size() > 0x00fffff0) && U_SUCCESS(*fStatus
)) {
1991 error(U_REGEX_PATTERN_TOO_BIG
);
1995 void RegexCompile::appendOp(int32_t type
, int32_t val
) {
1996 appendOp(buildOp(type
, val
));
2000 //------------------------------------------------------------------------------
2002 // insertOp() Insert a slot for a new opcode into the already
2003 // compiled pattern code.
2005 // Fill the slot with a NOP. Our caller will replace it
2006 // with what they really wanted.
2008 //------------------------------------------------------------------------------
2009 void RegexCompile::insertOp(int32_t where
) {
2010 UVector64
*code
= fRXPat
->fCompiledPat
;
2011 U_ASSERT(where
>0 && where
< code
->size());
2013 int32_t nop
= buildOp(URX_NOP
, 0);
2014 code
->insertElementAt(nop
, where
, *fStatus
);
2016 // Walk through the pattern, looking for any ops with targets that
2017 // were moved down by the insert. Fix them.
2019 for (loc
=0; loc
<code
->size(); loc
++) {
2020 int32_t op
= (int32_t)code
->elementAti(loc
);
2021 int32_t opType
= URX_TYPE(op
);
2022 int32_t opValue
= URX_VAL(op
);
2023 if ((opType
== URX_JMP
||
2024 opType
== URX_JMPX
||
2025 opType
== URX_STATE_SAVE
||
2026 opType
== URX_CTR_LOOP
||
2027 opType
== URX_CTR_LOOP_NG
||
2028 opType
== URX_JMP_SAV
||
2029 opType
== URX_JMP_SAV_X
||
2030 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
2031 // Target location for this opcode is after the insertion point and
2032 // needs to be incremented to adjust for the insertion.
2034 op
= buildOp(opType
, opValue
);
2035 code
->setElementAt(op
, loc
);
2039 // Now fix up the parentheses stack. All positive values in it are locations in
2040 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2041 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
2042 int32_t x
= fParenStack
.elementAti(loc
);
2043 U_ASSERT(x
< code
->size());
2046 fParenStack
.setElementAt(x
, loc
);
2050 if (fMatchCloseParen
> where
) {
2053 if (fMatchOpenParen
> where
) {
2059 //------------------------------------------------------------------------------
2061 // allocateData() Allocate storage in the matcher's static data area.
2062 // Return the index for the newly allocated data.
2063 // The storage won't actually exist until we are running a match
2064 // operation, but the storage indexes are inserted into various
2065 // opcodes while compiling the pattern.
2067 //------------------------------------------------------------------------------
2068 int32_t RegexCompile::allocateData(int32_t size
) {
2069 if (U_FAILURE(*fStatus
)) {
2072 if (size
<= 0 || size
> 0x100 || fRXPat
->fDataSize
< 0) {
2073 error(U_REGEX_INTERNAL_ERROR
);
2076 int32_t dataIndex
= fRXPat
->fDataSize
;
2077 fRXPat
->fDataSize
+= size
;
2078 if (fRXPat
->fDataSize
>= 0x00fffff0) {
2079 error(U_REGEX_INTERNAL_ERROR
);
2085 //------------------------------------------------------------------------------
2087 // allocateStackData() Allocate space in the back-tracking stack frame.
2088 // Return the index for the newly allocated data.
2089 // The frame indexes are inserted into various
2090 // opcodes while compiling the pattern, meaning that frame
2091 // size must be restricted to the size that will fit
2092 // as an operand (24 bits).
2094 //------------------------------------------------------------------------------
2095 int32_t RegexCompile::allocateStackData(int32_t size
) {
2096 if (U_FAILURE(*fStatus
)) {
2099 if (size
<= 0 || size
> 0x100 || fRXPat
->fFrameSize
< 0) {
2100 error(U_REGEX_INTERNAL_ERROR
);
2103 int32_t dataIndex
= fRXPat
->fFrameSize
;
2104 fRXPat
->fFrameSize
+= size
;
2105 if (fRXPat
->fFrameSize
>= 0x00fffff0) {
2106 error(U_REGEX_PATTERN_TOO_BIG
);
2112 //------------------------------------------------------------------------------
2114 // blockTopLoc() Find or create a location in the compiled pattern
2115 // at the start of the operation or block that has
2116 // just been compiled. Needed when a quantifier (* or
2117 // whatever) appears, and we need to add an operation
2118 // at the start of the thing being quantified.
2120 // (Parenthesized Blocks) have a slot with a NOP that
2121 // is reserved for this purpose. .* or similar don't
2122 // and a slot needs to be added.
2124 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2125 // at the returned location.
2126 // FALSE - just return the address,
2127 // do not reserve a location there.
2129 //------------------------------------------------------------------------------
2130 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
2132 fixLiterals(TRUE
); // Emit code for any pending literals.
2133 // If last item was a string, emit separate op for the its last char.
2134 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
2136 // The item just processed is a parenthesized block.
2137 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
2138 U_ASSERT(theLoc
> 0);
2139 U_ASSERT(URX_TYPE(((uint32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
))) == URX_NOP
);
2142 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2143 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2144 // We need to make space now.
2145 theLoc
= fRXPat
->fCompiledPat
->size()-1;
2146 int32_t opAtTheLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
);
2147 if (URX_TYPE(opAtTheLoc
) == URX_STRING_LEN
) {
2148 // Strings take two opcode, we want the position of the first one.
2149 // We can have a string at this point if a single character case-folded to two.
2153 int32_t nop
= buildOp(URX_NOP
, 0);
2154 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
2162 //------------------------------------------------------------------------------
2164 // handleCloseParen When compiling a close paren, we need to go back
2165 // and fix up any JMP or SAVE operations within the
2166 // parenthesized block that need to target the end
2167 // of the block. The locations of these are kept on
2168 // the paretheses stack.
2170 // This function is called both when encountering a
2171 // real ) and at the end of the pattern.
2173 //------------------------------------------------------------------------------
2174 void RegexCompile::handleCloseParen() {
2177 if (fParenStack
.size() <= 0) {
2178 error(U_REGEX_MISMATCHED_PAREN
);
2182 // Emit code for any pending literals.
2185 // Fixup any operations within the just-closed parenthesized group
2186 // that need to reference the end of the (block).
2187 // (The first one popped from the stack is an unused slot for
2188 // alternation (OR) state save, but applying the fixup to it does no harm.)
2190 patIdx
= fParenStack
.popi();
2192 // value < 0 flags the start of the frame on the paren stack.
2195 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
2196 patOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(patIdx
);
2197 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
2198 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
2199 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
2200 fMatchOpenParen
= patIdx
;
2203 // At the close of any parenthesized block, restore the match mode flags to
2204 // the value they had at the open paren. Saved value is
2205 // at the top of the paren stack.
2206 fModeFlags
= fParenStack
.popi();
2207 U_ASSERT(fModeFlags
< 0);
2209 // DO any additional fixups, depending on the specific kind of
2210 // parentesized grouping this is
2215 // No additional fixups required.
2216 // (Grouping-only parentheses)
2219 // Capturing Parentheses.
2220 // Insert a End Capture op into the pattern.
2221 // The frame offset of the variables for this cg is obtained from the
2222 // start capture op and put it into the end-capture op.
2224 int32_t captureOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2225 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
2227 int32_t frameVarLocation
= URX_VAL(captureOp
);
2228 appendOp(URX_END_CAPTURE
, frameVarLocation
);
2232 // Atomic Parenthesis.
2233 // Insert a LD_SP operation to restore the state stack to the position
2234 // it was when the atomic parens were entered.
2236 int32_t stoOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2237 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
2238 int32_t stoLoc
= URX_VAL(stoOp
);
2239 appendOp(URX_LD_SP
, stoLoc
);
2245 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2246 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2247 int32_t dataLoc
= URX_VAL(startOp
);
2248 appendOp(URX_LA_END
, dataLoc
);
2254 // See comment at doOpenLookAheadNeg
2255 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
2256 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2257 int32_t dataLoc
= URX_VAL(startOp
);
2258 appendOp(URX_LA_END
, dataLoc
);
2259 appendOp(URX_BACKTRACK
, 0);
2260 appendOp(URX_LA_END
, dataLoc
);
2262 // Patch the URX_SAVE near the top of the block.
2263 // The destination of the SAVE is the final LA_END that was just added.
2264 int32_t saveOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
2265 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
2266 int32_t dest
= fRXPat
->fCompiledPat
->size()-1;
2267 saveOp
= buildOp(URX_STATE_SAVE
, dest
);
2268 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
2274 // See comment at doOpenLookBehind.
2276 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2277 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
2278 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2279 int32_t dataLoc
= URX_VAL(startOp
);
2280 appendOp(URX_LB_END
, dataLoc
);
2281 appendOp(URX_LA_END
, dataLoc
);
2283 // Determine the min and max bounds for the length of the
2284 // string that the pattern can match.
2285 // An unbounded upper limit is an error.
2286 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2287 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2288 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2289 if (URX_TYPE(maxML
) != 0) {
2290 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2293 if (maxML
== INT32_MAX
) {
2294 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2297 U_ASSERT(minML
<= maxML
);
2299 // Insert the min and max match len bounds into the URX_LB_CONT op that
2300 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2301 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
2302 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
2311 // See comment at doOpenLookBehindNeg.
2313 // Append the URX_LBN_END to the compiled pattern.
2314 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2315 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2316 int32_t dataLoc
= URX_VAL(startOp
);
2317 appendOp(URX_LBN_END
, dataLoc
);
2319 // Determine the min and max bounds for the length of the
2320 // string that the pattern can match.
2321 // An unbounded upper limit is an error.
2322 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2323 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2324 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2325 if (URX_TYPE(maxML
) != 0) {
2326 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2329 if (maxML
== INT32_MAX
) {
2330 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2333 U_ASSERT(minML
<= maxML
);
2335 // Insert the min and max match len bounds into the URX_LB_CONT op that
2336 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2337 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
2338 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
2340 // Insert the pattern location to continue at after a successful match
2341 // as the last operand of the URX_LBN_CONT
2342 int32_t op
= buildOp(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
2343 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
2353 // remember the next location in the compiled pattern.
2354 // The compilation of Quantifiers will look at this to see whether its looping
2355 // over a parenthesized block or a single item
2356 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
2361 //------------------------------------------------------------------------------
2363 // compileSet Compile the pattern operations for a reference to a
2366 //------------------------------------------------------------------------------
2367 void RegexCompile::compileSet(UnicodeSet
*theSet
)
2369 if (theSet
== NULL
) {
2372 // Remove any strings from the set.
2373 // There shoudn't be any, but just in case.
2374 // (Case Closure can add them; if we had a simple case closure avaialble that
2375 // ignored strings, that would be better.)
2376 theSet
->removeAllStrings();
2377 int32_t setSize
= theSet
->size();
2382 // Set of no elements. Always fails to match.
2383 appendOp(URX_BACKTRACK
, 0);
2390 // The set contains only a single code point. Put it into
2391 // the compiled pattern as a single char operation rather
2392 // than a set, and discard the set itself.
2393 literalChar(theSet
->charAt(0));
2400 // The set contains two or more chars. (the normal case)
2401 // Put it into the compiled pattern as a set.
2402 int32_t setNumber
= fRXPat
->fSets
->size();
2403 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
2404 appendOp(URX_SETREF
, setNumber
);
2410 //------------------------------------------------------------------------------
2412 // compileInterval Generate the code for a {min, max} style interval quantifier.
2413 // Except for the specific opcodes used, the code is the same
2414 // for all three types (greedy, non-greedy, possessive) of
2415 // intervals. The opcodes are supplied as parameters.
2416 // (There are two sets of opcodes - greedy & possessive use the
2417 // same ones, while non-greedy has it's own.)
2419 // The code for interval loops has this form:
2420 // 0 CTR_INIT counter loc (in stack frame)
2421 // 1 5 patt address of CTR_LOOP at bottom of block
2423 // 3 max count (-1 for unbounded)
2424 // 4 ... block to be iterated over
2428 //------------------------------------------------------------------------------
2429 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
2431 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2432 // four slots in the compiled code. Reserve them.
2433 int32_t topOfBlock
= blockTopLoc(TRUE
);
2434 insertOp(topOfBlock
);
2435 insertOp(topOfBlock
);
2436 insertOp(topOfBlock
);
2438 // The operands for the CTR_INIT opcode include the index in the matcher data
2439 // of the counter. Allocate it now. There are two data items
2440 // counterLoc --> Loop counter
2441 // +1 --> Input index (for breaking non-progressing loops)
2442 // (Only present if unbounded upper limit on loop)
2443 int32_t dataSize
= fIntervalUpper
< 0 ? 2 : 1;
2444 int32_t counterLoc
= allocateStackData(dataSize
);
2446 int32_t op
= buildOp(InitOp
, counterLoc
);
2447 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
2449 // The second operand of CTR_INIT is the location following the end of the loop.
2450 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2451 // compilation of something later on causes the code to grow and the target
2452 // position to move.
2453 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
2454 op
= buildOp(URX_RELOC_OPRND
, loopEnd
);
2455 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
2457 // Followed by the min and max counts.
2458 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
2459 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
2461 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2462 // Goes at end of the block being looped over, so just append to the code so far.
2463 appendOp(LoopOp
, topOfBlock
);
2465 if ((fIntervalLow
& 0xff000000) != 0 ||
2466 (fIntervalUpper
> 0 && (fIntervalUpper
& 0xff000000) != 0)) {
2467 error(U_REGEX_NUMBER_TOO_BIG
);
2470 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
2471 error(U_REGEX_MAX_LT_MIN
);
2477 UBool
RegexCompile::compileInlineInterval() {
2478 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
2479 // Too big to inline. Fail, which will cause looping code to be generated.
2480 // (Upper < Lower picks up unbounded upper and errors, both.)
2484 int32_t topOfBlock
= blockTopLoc(FALSE
);
2485 if (fIntervalUpper
== 0) {
2486 // Pathological case. Attempt no matches, as if the block doesn't exist.
2487 // Discard the generated code for the block.
2488 // If the block included parens, discard the info pertaining to them as well.
2489 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2490 if (fMatchOpenParen
>= topOfBlock
) {
2491 fMatchOpenParen
= -1;
2493 if (fMatchCloseParen
>= topOfBlock
) {
2494 fMatchCloseParen
= -1;
2499 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2500 // The thing being repeated is not a single op, but some
2501 // more complex block. Do it as a loop, not inlines.
2502 // Note that things "repeated" a max of once are handled as inline, because
2503 // the one copy of the code already generated is just fine.
2507 // Pick up the opcode that is to be repeated
2509 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2511 // Compute the pattern location where the inline sequence
2512 // will end, and set up the state save op that will be needed.
2514 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2515 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2516 int32_t saveOp
= buildOp(URX_STATE_SAVE
, endOfSequenceLoc
);
2517 if (fIntervalLow
== 0) {
2518 insertOp(topOfBlock
);
2519 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2524 // Loop, emitting the op for the thing being repeated each time.
2525 // Loop starts at 1 because one instance of the op already exists in the pattern,
2526 // it was put there when it was originally encountered.
2528 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2529 if (i
>= fIntervalLow
) {
2539 //------------------------------------------------------------------------------
2541 // caseInsensitiveStart given a single code point from a pattern string, determine the
2542 // set of characters that could potentially begin a case-insensitive
2543 // match of a string beginning with that character, using full Unicode
2544 // case insensitive matching.
2546 // This is used in optimizing find().
2548 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2549 // misses cases like this:
2550 // A string from the pattern begins with 'ss' (although all we know
2551 // in this context is that it begins with 's')
2552 // The pattern could match a string beginning with a German sharp-s
2554 // To the ordinary case closure for a character c, we add all other
2555 // characters cx where the case closure of cx incudes a string form that begins
2556 // with the original character c.
2558 // This function could be made smarter. The full pattern string is available
2559 // and it would be possible to verify that the extra characters being added
2560 // to the starting set fully match, rather than having just a first-char of the
2561 // folded form match.
2563 //------------------------------------------------------------------------------
2564 void RegexCompile::findCaseInsensitiveStarters(UChar32 c
, UnicodeSet
*starterChars
) {
2566 // Machine Generated below.
2567 // It may need updating with new versions of Unicode.
2568 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2569 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2571 // Machine Generated Data. Do not hand edit.
2572 static const UChar32 RECaseFixCodePoints
[] = {
2573 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2574 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2575 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2576 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2577 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2579 static const int16_t RECaseFixStringOffsets
[] = {
2580 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2581 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2582 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2583 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2584 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2586 static const int16_t RECaseFixCounts
[] = {
2587 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2588 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2589 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2590 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2591 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2593 static const UChar RECaseFixData
[] = {
2594 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2595 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2596 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2597 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2598 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2599 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2600 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2601 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2602 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2603 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2604 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2606 // End of machine generated data.
2608 if (c
< UCHAR_MIN_VALUE
|| c
> UCHAR_MAX_VALUE
) {
2609 // This function should never be called with an invalid input character.
2611 starterChars
->clear();
2612 } else if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2613 UChar32 caseFoldedC
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
2614 starterChars
->set(caseFoldedC
, caseFoldedC
);
2617 for (i
=0; RECaseFixCodePoints
[i
]<c
; i
++) {
2618 // Simple linear search through the sorted list of interesting code points.
2621 if (RECaseFixCodePoints
[i
] == c
) {
2622 int32_t dataIndex
= RECaseFixStringOffsets
[i
];
2623 int32_t numCharsToAdd
= RECaseFixCounts
[i
];
2624 UChar32 cpToAdd
= 0;
2625 for (int32_t j
=0; j
<numCharsToAdd
; j
++) {
2626 U16_NEXT_UNSAFE(RECaseFixData
, dataIndex
, cpToAdd
);
2627 starterChars
->add(cpToAdd
);
2631 starterChars
->closeOver(USET_CASE_INSENSITIVE
);
2632 starterChars
->removeAllStrings();
2634 // Not a cased character. Just return it alone.
2635 starterChars
->set(c
, c
);
2640 // Increment with overflow check.
2641 // val and delta will both be positive.
2643 static int32_t safeIncrement(int32_t val
, int32_t delta
) {
2644 if (INT32_MAX
- val
> delta
) {
2652 //------------------------------------------------------------------------------
2654 // matchStartType Determine how a match can start.
2655 // Used to optimize find() operations.
2657 // Operation is very similar to minMatchLength(). Walk the compiled
2658 // pattern, keeping an on-going minimum-match-length. For any
2659 // op where the min match coming in is zero, add that ops possible
2660 // starting matches to the possible starts for the overall pattern.
2662 //------------------------------------------------------------------------------
2663 void RegexCompile::matchStartType() {
2664 if (U_FAILURE(*fStatus
)) {
2669 int32_t loc
; // Location in the pattern of the current op being processed.
2670 int32_t op
; // The op being processed
2671 int32_t opType
; // The opcode type of the op
2672 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2673 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2675 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2676 // could have advanced the position in a match.
2677 // (Maximum match length so far == 0)
2679 // forwardedLength is a vector holding minimum-match-length values that
2680 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2681 // It must be one longer than the pattern being checked because some ops
2682 // will jmp to a end-of-block+1 location from within a block, and we must
2683 // count those when checking the block.
2684 int32_t end
= fRXPat
->fCompiledPat
->size();
2685 UVector32
forwardedLength(end
+1, *fStatus
);
2686 forwardedLength
.setSize(end
+1);
2687 for (loc
=3; loc
<end
; loc
++) {
2688 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2691 for (loc
= 3; loc
<end
; loc
++) {
2692 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2693 opType
= URX_TYPE(op
);
2695 // The loop is advancing linearly through the pattern.
2696 // If the op we are now at was the destination of a branch in the pattern,
2697 // and that path has a shorter minimum length than the current accumulated value,
2698 // replace the current accumulated value.
2699 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2700 currentLen
= forwardedLength
.elementAti(loc
);
2701 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2705 // Ops that don't change the total length matched
2706 case URX_RESERVED_OP
:
2709 case URX_STRING_LEN
:
2711 case URX_START_CAPTURE
:
2712 case URX_END_CAPTURE
:
2713 case URX_BACKSLASH_B
:
2714 case URX_BACKSLASH_BU
:
2715 case URX_BACKSLASH_G
:
2716 case URX_BACKSLASH_Z
:
2721 case URX_RELOC_OPRND
:
2722 case URX_STO_INP_LOC
:
2723 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2726 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2732 fRXPat
->fStartType
= START_START
;
2737 case URX_CARET_M_UNIX
:
2739 fRXPat
->fStartType
= START_LINE
;
2744 if (currentLen
== 0) {
2745 // This character could appear at the start of a match.
2746 // Add it to the set of possible starting characters.
2747 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2748 numInitialStrings
+= 2;
2750 currentLen
= safeIncrement(currentLen
, 1);
2756 if (currentLen
== 0) {
2757 int32_t sn
= URX_VAL(op
);
2758 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2759 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2760 fRXPat
->fInitialChars
->addAll(*s
);
2761 numInitialStrings
+= 2;
2763 currentLen
= safeIncrement(currentLen
, 1);
2768 // [Set]*, like a SETREF, above, in what it can match,
2769 // but may not match at all, so currentLen is not incremented.
2770 if (currentLen
== 0) {
2771 int32_t sn
= URX_VAL(op
);
2772 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2773 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2774 fRXPat
->fInitialChars
->addAll(*s
);
2775 numInitialStrings
+= 2;
2780 case URX_LOOP_DOT_I
:
2781 if (currentLen
== 0) {
2782 // .* at the start of a pattern.
2783 // Any character can begin the match.
2784 fRXPat
->fInitialChars
->clear();
2785 fRXPat
->fInitialChars
->complement();
2786 numInitialStrings
+= 2;
2792 case URX_STATIC_SETREF
:
2793 if (currentLen
== 0) {
2794 int32_t sn
= URX_VAL(op
);
2795 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2796 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2797 fRXPat
->fInitialChars
->addAll(*s
);
2798 numInitialStrings
+= 2;
2800 currentLen
= safeIncrement(currentLen
, 1);
2806 case URX_STAT_SETREF_N
:
2807 if (currentLen
== 0) {
2808 int32_t sn
= URX_VAL(op
);
2809 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2812 fRXPat
->fInitialChars
->addAll(sc
);
2813 numInitialStrings
+= 2;
2815 currentLen
= safeIncrement(currentLen
, 1);
2821 case URX_BACKSLASH_D
:
2823 if (currentLen
== 0) {
2825 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2826 if (URX_VAL(op
) != 0) {
2829 fRXPat
->fInitialChars
->addAll(s
);
2830 numInitialStrings
+= 2;
2832 currentLen
= safeIncrement(currentLen
, 1);
2837 case URX_BACKSLASH_H
:
2838 // Horiz white space
2839 if (currentLen
== 0) {
2841 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
2842 s
.add((UChar32
)9); // Tab
2843 if (URX_VAL(op
) != 0) {
2846 fRXPat
->fInitialChars
->addAll(s
);
2847 numInitialStrings
+= 2;
2849 currentLen
= safeIncrement(currentLen
, 1);
2854 case URX_BACKSLASH_R
: // Any line ending sequence
2855 case URX_BACKSLASH_V
: // Any line ending code point, with optional negation
2856 if (currentLen
== 0) {
2858 s
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
2859 s
.add((UChar32
)0x85);
2860 s
.add((UChar32
)0x2028, (UChar32
)0x2029);
2861 if (URX_VAL(op
) != 0) {
2862 // Complement option applies to URX_BACKSLASH_V only.
2865 fRXPat
->fInitialChars
->addAll(s
);
2866 numInitialStrings
+= 2;
2868 currentLen
= safeIncrement(currentLen
, 1);
2875 // Case Insensitive Single Character.
2876 if (currentLen
== 0) {
2877 UChar32 c
= URX_VAL(op
);
2878 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2879 UnicodeSet
starters(c
, c
);
2880 starters
.closeOver(USET_CASE_INSENSITIVE
);
2881 // findCaseInsensitiveStarters(c, &starters);
2882 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2883 // The expanded folding can't match the pattern.
2884 fRXPat
->fInitialChars
->addAll(starters
);
2886 // Char has no case variants. Just add it as-is to the
2887 // set of possible starting chars.
2888 fRXPat
->fInitialChars
->add(c
);
2890 numInitialStrings
+= 2;
2892 currentLen
= safeIncrement(currentLen
, 1);
2897 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2898 case URX_DOTANY_ALL
: // . matches one or two.
2900 case URX_DOTANY_UNIX
:
2901 if (currentLen
== 0) {
2902 // These constructs are all bad news when they appear at the start
2903 // of a match. Any character can begin the match.
2904 fRXPat
->fInitialChars
->clear();
2905 fRXPat
->fInitialChars
->complement();
2906 numInitialStrings
+= 2;
2908 currentLen
= safeIncrement(currentLen
, 1);
2914 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2918 int32_t jmpDest
= URX_VAL(op
);
2919 if (jmpDest
< loc
) {
2920 // Loop of some kind. Can safely ignore, the worst that will happen
2921 // is that we understate the true minimum length
2922 currentLen
= forwardedLength
.elementAti(loc
+1);
2925 // Forward jump. Propagate the current min length to the target loc of the jump.
2926 U_ASSERT(jmpDest
<= end
+1);
2927 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2928 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2937 // Combo of state save to the next loc, + jmp backwards.
2938 // Net effect on min. length computation is nothing.
2943 // Fails are kind of like a branch, except that the min length was
2944 // propagated already, by the state save.
2945 currentLen
= forwardedLength
.elementAti(loc
+1);
2950 case URX_STATE_SAVE
:
2952 // State Save, for forward jumps, propagate the current minimum.
2953 // of the state save.
2954 int32_t jmpDest
= URX_VAL(op
);
2955 if (jmpDest
> loc
) {
2956 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2957 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2970 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2971 int32_t stringLen
= URX_VAL(stringLenOp
);
2972 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2973 U_ASSERT(stringLenOp
>= 2);
2974 if (currentLen
== 0) {
2975 // Add the starting character of this string to the set of possible starting
2976 // characters for this pattern.
2977 int32_t stringStartIdx
= URX_VAL(op
);
2978 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2979 fRXPat
->fInitialChars
->add(c
);
2981 // Remember this string. After the entire pattern has been checked,
2982 // if nothing else is identified that can start a match, we'll use it.
2983 numInitialStrings
++;
2984 fRXPat
->fInitialStringIdx
= stringStartIdx
;
2985 fRXPat
->fInitialStringLen
= stringLen
;
2988 currentLen
= safeIncrement(currentLen
, stringLen
);
2995 // Case-insensitive string. Unlike exact-match strings, we won't
2996 // attempt a string search for possible match positions. But we
2997 // do update the set of possible starting characters.
2999 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3000 int32_t stringLen
= URX_VAL(stringLenOp
);
3001 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
3002 U_ASSERT(stringLenOp
>= 2);
3003 if (currentLen
== 0) {
3004 // Add the starting character of this string to the set of possible starting
3005 // characters for this pattern.
3006 int32_t stringStartIdx
= URX_VAL(op
);
3007 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
3009 findCaseInsensitiveStarters(c
, &s
);
3010 fRXPat
->fInitialChars
->addAll(s
);
3011 numInitialStrings
+= 2; // Matching on an initial string not possible.
3013 currentLen
= safeIncrement(currentLen
, stringLen
);
3019 case URX_CTR_INIT_NG
:
3021 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3022 // so location must be updated accordingly.
3024 // If the min loop count == 0
3025 // move loc forwards to the end of the loop, skipping over the body.
3026 // If the min count is > 0,
3027 // continue normal processing of the body of the loop.
3028 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3029 loopEndLoc
= URX_VAL(loopEndLoc
);
3030 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3031 if (minLoopCount
== 0) {
3032 // Min Loop Count of 0, treat like a forward branch and
3033 // move the current minimum length up to the target
3034 // (end of loop) location.
3035 U_ASSERT(loopEndLoc
<= end
+1);
3036 if (forwardedLength
.elementAti(loopEndLoc
) > currentLen
) {
3037 forwardedLength
.setElementAt(currentLen
, loopEndLoc
);
3040 loc
+=3; // Skips over operands of CTR_INIT
3047 case URX_CTR_LOOP_NG
:
3049 // The jump is conditional, backwards only.
3054 // More loop ops. These state-save to themselves.
3055 // don't change the minimum match
3063 // Look-around. Scan forward until the matching look-ahead end,
3064 // without processing the look-around block. This is overly pessimistic.
3066 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3067 // lookahead contains two LA_END instructions, so count goes up by two
3068 // for each LA_START.
3069 int32_t depth
= (opType
== URX_LA_START
? 2: 1);
3072 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3073 if (URX_TYPE(op
) == URX_LA_START
) {
3076 if (URX_TYPE(op
) == URX_LB_START
) {
3079 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3085 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3086 // Need this because neg lookahead blocks will FAIL to outside
3088 int32_t jmpDest
= URX_VAL(op
);
3089 if (jmpDest
> loc
) {
3090 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3091 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3095 U_ASSERT(loc
<= end
);
3105 U_ASSERT(FALSE
); // Shouldn't get here. These ops should be
3106 // consumed by the scan in URX_LA_START and LB_START
3117 // We have finished walking through the ops. Check whether some forward jump
3118 // propagated a shorter length to location end+1.
3119 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3120 currentLen
= forwardedLength
.elementAti(end
+1);
3124 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
3127 // Sort out what we should check for when looking for candidate match start positions.
3128 // In order of preference,
3129 // 1. Start of input text buffer.
3130 // 2. A literal string.
3131 // 3. Start of line in multi-line mode.
3132 // 4. A single literal character.
3133 // 5. A character from a set of characters.
3135 if (fRXPat
->fStartType
== START_START
) {
3136 // Match only at the start of an input text string.
3137 // start type is already set. We're done.
3138 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
3139 // Match beginning only with a literal string.
3140 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
3141 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
3142 fRXPat
->fStartType
= START_STRING
;
3143 fRXPat
->fInitialChar
= c
;
3144 } else if (fRXPat
->fStartType
== START_LINE
) {
3145 // Match at start of line in Multi-Line mode.
3146 // Nothing to do here; everything is already set.
3147 } else if (fRXPat
->fMinMatchLen
== 0) {
3148 // Zero length match possible. We could start anywhere.
3149 fRXPat
->fStartType
= START_NO_INFO
;
3150 } else if (fRXPat
->fInitialChars
->size() == 1) {
3151 // All matches begin with the same char.
3152 fRXPat
->fStartType
= START_CHAR
;
3153 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
3154 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
3155 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
3156 fRXPat
->fMinMatchLen
> 0) {
3157 // Matches start with a set of character smaller than the set of all chars.
3158 fRXPat
->fStartType
= START_SET
;
3160 // Matches can start with anything
3161 fRXPat
->fStartType
= START_NO_INFO
;
3169 //------------------------------------------------------------------------------
3171 // minMatchLength Calculate the length of the shortest string that could
3172 // match the specified pattern.
3173 // Length is in 16 bit code units, not code points.
3175 // The calculated length may not be exact. The returned
3176 // value may be shorter than the actual minimum; it must
3179 // start and end are the range of p-code operations to be
3180 // examined. The endpoints are included in the range.
3182 //------------------------------------------------------------------------------
3183 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
3184 if (U_FAILURE(*fStatus
)) {
3188 U_ASSERT(start
<= end
);
3189 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3195 int32_t currentLen
= 0;
3198 // forwardedLength is a vector holding minimum-match-length values that
3199 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3200 // It must be one longer than the pattern being checked because some ops
3201 // will jmp to a end-of-block+1 location from within a block, and we must
3202 // count those when checking the block.
3203 UVector32
forwardedLength(end
+2, *fStatus
);
3204 forwardedLength
.setSize(end
+2);
3205 for (loc
=start
; loc
<=end
+1; loc
++) {
3206 forwardedLength
.setElementAt(INT32_MAX
, loc
);
3209 for (loc
= start
; loc
<=end
; loc
++) {
3210 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3211 opType
= URX_TYPE(op
);
3213 // The loop is advancing linearly through the pattern.
3214 // If the op we are now at was the destination of a branch in the pattern,
3215 // and that path has a shorter minimum length than the current accumulated value,
3216 // replace the current accumulated value.
3217 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3218 // no-match-possible cases.
3219 if (forwardedLength
.elementAti(loc
) < currentLen
) {
3220 currentLen
= forwardedLength
.elementAti(loc
);
3221 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3225 // Ops that don't change the total length matched
3226 case URX_RESERVED_OP
:
3228 case URX_STRING_LEN
:
3230 case URX_START_CAPTURE
:
3231 case URX_END_CAPTURE
:
3232 case URX_BACKSLASH_B
:
3233 case URX_BACKSLASH_BU
:
3234 case URX_BACKSLASH_G
:
3235 case URX_BACKSLASH_Z
:
3241 case URX_RELOC_OPRND
:
3242 case URX_STO_INP_LOC
:
3244 case URX_CARET_M_UNIX
:
3245 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3248 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3256 // Ops that match a minimum of one character (one or two 16 bit code units.)
3259 case URX_STATIC_SETREF
:
3260 case URX_STAT_SETREF_N
:
3262 case URX_BACKSLASH_D
:
3263 case URX_BACKSLASH_H
:
3264 case URX_BACKSLASH_R
:
3265 case URX_BACKSLASH_V
:
3267 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3268 case URX_DOTANY_ALL
: // . matches one or two.
3270 case URX_DOTANY_UNIX
:
3271 currentLen
= safeIncrement(currentLen
, 1);
3276 loc
++; // URX_JMPX has an extra operand, ignored here,
3277 // otherwise processed identically to URX_JMP.
3281 int32_t jmpDest
= URX_VAL(op
);
3282 if (jmpDest
< loc
) {
3283 // Loop of some kind. Can safely ignore, the worst that will happen
3284 // is that we understate the true minimum length
3285 currentLen
= forwardedLength
.elementAti(loc
+1);
3287 // Forward jump. Propagate the current min length to the target loc of the jump.
3288 U_ASSERT(jmpDest
<= end
+1);
3289 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
3290 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3298 // Back-tracks are kind of like a branch, except that the min length was
3299 // propagated already, by the state save.
3300 currentLen
= forwardedLength
.elementAti(loc
+1);
3305 case URX_STATE_SAVE
:
3307 // State Save, for forward jumps, propagate the current minimum.
3308 // of the state save.
3309 int32_t jmpDest
= URX_VAL(op
);
3310 if (jmpDest
> loc
) {
3311 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3312 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3322 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3323 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3331 // TODO: with full case folding, matching input text may be shorter than
3332 // the string we have here. More smarts could put some bounds on it.
3333 // Assume a min length of one for now. A min length of zero causes
3334 // optimization failures for a pattern like "string"+
3335 // currentLen += URX_VAL(stringLenOp);
3336 currentLen
= safeIncrement(currentLen
, 1);
3341 case URX_CTR_INIT_NG
:
3344 // If the min loop count == 0
3345 // move loc forwards to the end of the loop, skipping over the body.
3346 // If the min count is > 0,
3347 // continue normal processing of the body of the loop.
3348 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3349 loopEndLoc
= URX_VAL(loopEndLoc
);
3350 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3351 if (minLoopCount
== 0) {
3354 loc
+=3; // Skips over operands of CTR_INIT
3361 case URX_CTR_LOOP_NG
:
3363 // The jump is conditional, backwards only.
3367 case URX_LOOP_DOT_I
:
3369 // More loop ops. These state-save to themselves.
3370 // don't change the minimum match - could match nothing at all.
3377 // Look-around. Scan forward until the matching look-ahead end,
3378 // without processing the look-around block. This is overly pessimistic for look-ahead,
3379 // it assumes that the look-ahead match might be zero-length.
3380 // TODO: Positive lookahead could recursively do the block, then continue
3381 // with the longer of the block or the value coming in. Ticket 6060
3382 int32_t depth
= (opType
== URX_LA_START
? 2: 1);;
3385 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3386 if (URX_TYPE(op
) == URX_LA_START
) {
3387 // The boilerplate for look-ahead includes two LA_END insturctions,
3388 // Depth will be decremented by each one when it is seen.
3391 if (URX_TYPE(op
) == URX_LB_START
) {
3394 if (URX_TYPE(op
) == URX_LA_END
) {
3400 if (URX_TYPE(op
)==URX_LBN_END
) {
3406 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3407 // Need this because neg lookahead blocks will FAIL to outside
3409 int32_t jmpDest
= URX_VAL(op
);
3410 if (jmpDest
> loc
) {
3411 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3412 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3416 U_ASSERT(loc
<= end
);
3426 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3427 // range being sized, which happens when measuring size of look-behind blocks.
3436 // We have finished walking through the ops. Check whether some forward jump
3437 // propagated a shorter length to location end+1.
3438 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3439 currentLen
= forwardedLength
.elementAti(end
+1);
3440 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3446 //------------------------------------------------------------------------------
3448 // maxMatchLength Calculate the length of the longest string that could
3449 // match the specified pattern.
3450 // Length is in 16 bit code units, not code points.
3452 // The calculated length may not be exact. The returned
3453 // value may be longer than the actual maximum; it must
3454 // never be shorter.
3456 //------------------------------------------------------------------------------
3457 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
3458 if (U_FAILURE(*fStatus
)) {
3461 U_ASSERT(start
<= end
);
3462 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3468 int32_t currentLen
= 0;
3469 UVector32
forwardedLength(end
+1, *fStatus
);
3470 forwardedLength
.setSize(end
+1);
3472 for (loc
=start
; loc
<=end
; loc
++) {
3473 forwardedLength
.setElementAt(0, loc
);
3476 for (loc
= start
; loc
<=end
; loc
++) {
3477 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3478 opType
= URX_TYPE(op
);
3480 // The loop is advancing linearly through the pattern.
3481 // If the op we are now at was the destination of a branch in the pattern,
3482 // and that path has a longer maximum length than the current accumulated value,
3483 // replace the current accumulated value.
3484 if (forwardedLength
.elementAti(loc
) > currentLen
) {
3485 currentLen
= forwardedLength
.elementAti(loc
);
3489 // Ops that don't change the total length matched
3490 case URX_RESERVED_OP
:
3492 case URX_STRING_LEN
:
3494 case URX_START_CAPTURE
:
3495 case URX_END_CAPTURE
:
3496 case URX_BACKSLASH_B
:
3497 case URX_BACKSLASH_BU
:
3498 case URX_BACKSLASH_G
:
3499 case URX_BACKSLASH_Z
:
3505 case URX_RELOC_OPRND
:
3506 case URX_STO_INP_LOC
:
3508 case URX_CARET_M_UNIX
:
3510 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3520 // Ops that increase that cause an unbounded increase in the length
3521 // of a matched string, or that increase it a hard to characterize way.
3522 // Call the max length unbounded, and stop further checking.
3523 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3525 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3526 currentLen
= INT32_MAX
;
3530 // Ops that match a max of one character (possibly two 16 bit code units.)
3532 case URX_STATIC_SETREF
:
3533 case URX_STAT_SETREF_N
:
3535 case URX_BACKSLASH_D
:
3536 case URX_BACKSLASH_H
:
3537 case URX_BACKSLASH_R
:
3538 case URX_BACKSLASH_V
:
3540 case URX_DOTANY_ALL
:
3542 case URX_DOTANY_UNIX
:
3543 currentLen
= safeIncrement(currentLen
, 2);
3546 // Single literal character. Increase current max length by one or two,
3547 // depending on whether the char is in the supplementary range.
3549 currentLen
= safeIncrement(currentLen
, 1);
3550 if (URX_VAL(op
) > 0x10000) {
3551 currentLen
= safeIncrement(currentLen
, 1);
3562 int32_t jmpDest
= URX_VAL(op
);
3563 if (jmpDest
< loc
) {
3564 // Loop of some kind. Max match length is unbounded.
3565 currentLen
= INT32_MAX
;
3567 // Forward jump. Propagate the current min length to the target loc of the jump.
3568 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
3569 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3577 // back-tracks are kind of like a branch, except that the max length was
3578 // propagated already, by the state save.
3579 currentLen
= forwardedLength
.elementAti(loc
+1);
3583 case URX_STATE_SAVE
:
3585 // State Save, for forward jumps, propagate the current minimum.
3586 // of the state save.
3587 // For backwards jumps, they create a loop, maximum
3588 // match length is unbounded.
3589 int32_t jmpDest
= URX_VAL(op
);
3590 if (jmpDest
> loc
) {
3591 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
3592 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3595 currentLen
= INT32_MAX
;
3606 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3607 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3612 // TODO: This code assumes that any user string that matches will be no longer
3613 // than our compiled string, with case insensitive matching.
3614 // Our compiled string has been case-folded already.
3616 // Any matching user string will have no more code points than our
3617 // compiled (folded) string. Folding may add code points, but
3620 // There is a potential problem if a supplemental code point
3621 // case-folds to a BMP code point. In this case our compiled string
3622 // could be shorter (in code units) than a matching user string.
3624 // At this time (Unicode 6.1) there are no such characters, and this case
3625 // is not being handled. A test, intltest regex/Bug9283, will fail if
3626 // any problematic characters are added to Unicode.
3628 // If this happens, we can make a set of the BMP chars that the
3629 // troublesome supplementals fold to, scan our string, and bump the
3630 // currentLen one extra for each that is found.
3634 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3635 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3640 case URX_CTR_INIT_NG
:
3641 // For Loops, recursively call this function on the pattern for the loop body,
3642 // then multiply the result by the maximum loop count.
3644 int32_t loopEndLoc
= URX_VAL(fRXPat
->fCompiledPat
->elementAti(loc
+1));
3645 if (loopEndLoc
== loc
+4) {
3646 // Loop has an empty body. No affect on max match length.
3647 // Continue processing with code after the loop end.
3652 int32_t maxLoopCount
= static_cast<int32_t>(fRXPat
->fCompiledPat
->elementAti(loc
+3));
3653 if (maxLoopCount
== -1) {
3654 // Unbounded Loop. No upper bound on match length.
3655 currentLen
= INT32_MAX
;
3659 U_ASSERT(loopEndLoc
>= loc
+4);
3660 int64_t blockLen
= maxMatchLength(loc
+4, loopEndLoc
-1); // Recursive call.
3661 int64_t updatedLen
= (int64_t)currentLen
+ blockLen
* maxLoopCount
;
3662 if (updatedLen
>= INT32_MAX
) {
3663 currentLen
= INT32_MAX
;
3666 currentLen
= (int32_t)updatedLen
;
3672 case URX_CTR_LOOP_NG
:
3673 // These opcodes will be skipped over by code for URX_CRT_INIT.
3674 // We shouldn't encounter them here.
3679 case URX_LOOP_DOT_I
:
3681 // For anything to do with loops, make the match length unbounded.
3682 currentLen
= INT32_MAX
;
3689 // Look-ahead. Just ignore, treat the look-ahead block as if
3690 // it were normal pattern. Gives a too-long match length,
3691 // but good enough for now.
3694 // End of look-ahead ops should always be consumed by the processing at
3695 // the URX_LA_START op.
3701 // Look-behind. Scan forward until the matching look-around end,
3702 // without processing the look-behind block.
3706 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3707 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
3710 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3716 U_ASSERT(loc
< end
);
3726 if (currentLen
== INT32_MAX
) {
3727 // The maximum length is unbounded.
3728 // Stop further processing of the pattern.
3738 //------------------------------------------------------------------------------
3740 // stripNOPs Remove any NOP operations from the compiled pattern code.
3741 // Extra NOPs are inserted for some constructs during the initial
3742 // code generation to provide locations that may be patched later.
3743 // Many end up unneeded, and are removed by this function.
3745 // In order to minimize the number of passes through the pattern,
3746 // back-reference fixup is also performed here (adjusting
3747 // back-reference operands to point to the correct frame offsets).
3749 //------------------------------------------------------------------------------
3750 void RegexCompile::stripNOPs() {
3752 if (U_FAILURE(*fStatus
)) {
3756 int32_t end
= fRXPat
->fCompiledPat
->size();
3757 UVector32
deltas(end
, *fStatus
);
3759 // Make a first pass over the code, computing the amount that things
3760 // will be offset at each location in the original code.
3763 for (loc
=0; loc
<end
; loc
++) {
3764 deltas
.addElement(d
, *fStatus
);
3765 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3766 if (URX_TYPE(op
) == URX_NOP
) {
3771 UnicodeString caseStringBuffer
;
3773 // Make a second pass over the code, removing the NOPs by moving following
3774 // code up, and patching operands that refer to code locations that
3775 // are being moved. The array of offsets from the first step is used
3776 // to compute the new operand values.
3779 for (src
=0; src
<end
; src
++) {
3780 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(src
);
3781 int32_t opType
= URX_TYPE(op
);
3786 case URX_STATE_SAVE
:
3789 case URX_CTR_LOOP_NG
:
3790 case URX_RELOC_OPRND
:
3794 // These are instructions with operands that refer to code locations.
3796 int32_t operandAddress
= URX_VAL(op
);
3797 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3798 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3799 op
= buildOp(opType
, fixedOperandAddress
);
3800 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3808 int32_t where
= URX_VAL(op
);
3809 if (where
> fRXPat
->fGroupMap
->size()) {
3810 error(U_REGEX_INVALID_BACK_REF
);
3813 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
3814 op
= buildOp(opType
, where
);
3815 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3818 fRXPat
->fNeedsAltInput
= TRUE
;
3821 case URX_RESERVED_OP
:
3822 case URX_RESERVED_OP_N
:
3827 case URX_STRING_LEN
:
3828 case URX_START_CAPTURE
:
3829 case URX_END_CAPTURE
:
3830 case URX_STATIC_SETREF
:
3831 case URX_STAT_SETREF_N
:
3835 case URX_BACKSLASH_B
:
3836 case URX_BACKSLASH_BU
:
3837 case URX_BACKSLASH_G
:
3838 case URX_BACKSLASH_X
:
3839 case URX_BACKSLASH_Z
:
3840 case URX_DOTANY_ALL
:
3841 case URX_BACKSLASH_D
:
3845 case URX_CTR_INIT_NG
:
3846 case URX_DOTANY_UNIX
:
3849 case URX_STO_INP_LOC
:
3856 case URX_CARET_M_UNIX
:
3863 case URX_LOOP_DOT_I
:
3867 case URX_BACKSLASH_H
:
3868 case URX_BACKSLASH_R
:
3869 case URX_BACKSLASH_V
:
3870 // These instructions are unaltered by the relocation.
3871 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3876 // Some op is unaccounted for.
3878 error(U_REGEX_INTERNAL_ERROR
);
3882 fRXPat
->fCompiledPat
->setSize(dst
);
3888 //------------------------------------------------------------------------------
3890 // Error Report a rule parse error.
3891 // Only report it if no previous error has been recorded.
3893 //------------------------------------------------------------------------------
3894 void RegexCompile::error(UErrorCode e
) {
3895 if (U_SUCCESS(*fStatus
)) {
3897 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3898 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3899 // int64_t. If the values of the latter are out of range for the former,
3900 // set them to the appropriate "field not supported" values.
3901 if (fLineNum
> 0x7FFFFFFF) {
3902 fParseErr
->line
= 0;
3903 fParseErr
->offset
= -1;
3904 } else if (fCharNum
> 0x7FFFFFFF) {
3905 fParseErr
->line
= (int32_t)fLineNum
;
3906 fParseErr
->offset
= -1;
3908 fParseErr
->line
= (int32_t)fLineNum
;
3909 fParseErr
->offset
= (int32_t)fCharNum
;
3912 UErrorCode status
= U_ZERO_ERROR
; // throwaway status for extracting context
3914 // Fill in the context.
3915 // Note: extractBetween() pins supplied indicies to the string bounds.
3916 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3917 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3918 utext_extract(fRXPat
->fPattern
, fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
, fParseErr
->preContext
, U_PARSE_CONTEXT_LEN
, &status
);
3919 utext_extract(fRXPat
->fPattern
, fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1, fParseErr
->postContext
, U_PARSE_CONTEXT_LEN
, &status
);
3925 // Assorted Unicode character constants.
3926 // Numeric because there is no portable way to enter them as literals.
3929 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3930 static const UChar chLF
= 0x0a; // Line Feed
3931 static const UChar chPound
= 0x23; // '#', introduces a comment.
3932 static const UChar chDigit0
= 0x30; // '0'
3933 static const UChar chDigit7
= 0x37; // '9'
3934 static const UChar chColon
= 0x3A; // ':'
3935 static const UChar chE
= 0x45; // 'E'
3936 static const UChar chQ
= 0x51; // 'Q'
3937 //static const UChar chN = 0x4E; // 'N'
3938 static const UChar chP
= 0x50; // 'P'
3939 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3940 //static const UChar chLBracket = 0x5b; // '['
3941 static const UChar chRBracket
= 0x5d; // ']'
3942 static const UChar chUp
= 0x5e; // '^'
3943 static const UChar chLowerP
= 0x70;
3944 static const UChar chLBrace
= 0x7b; // '{'
3945 static const UChar chRBrace
= 0x7d; // '}'
3946 static const UChar chNEL
= 0x85; // NEL newline variant
3947 static const UChar chLS
= 0x2028; // Unicode Line Separator
3950 //------------------------------------------------------------------------------
3952 // nextCharLL Low Level Next Char from the regex pattern.
3953 // Get a char from the string, keep track of input position
3954 // for error reporting.
3956 //------------------------------------------------------------------------------
3957 UChar32
RegexCompile::nextCharLL() {
3960 if (fPeekChar
!= -1) {
3966 // assume we're already in the right place
3967 ch
= UTEXT_NEXT32(fRXPat
->fPattern
);
3968 if (ch
== U_SENTINEL
) {
3975 (ch
== chLF
&& fLastChar
!= chCR
)) {
3976 // Character is starting a new line. Bump up the line number, and
3977 // reset the column to 0.
3982 // Character is not starting a new line. Except in the case of a
3983 // LF following a CR, increment the column position.
3992 //------------------------------------------------------------------------------
3994 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3995 // character without actually getting it.
3997 //------------------------------------------------------------------------------
3998 UChar32
RegexCompile::peekCharLL() {
3999 if (fPeekChar
== -1) {
4000 fPeekChar
= nextCharLL();
4006 //------------------------------------------------------------------------------
4008 // nextChar for pattern scanning. At this level, we handle stripping
4009 // out comments and processing some backslash character escapes.
4010 // The rest of the pattern grammar is handled at the next level up.
4012 //------------------------------------------------------------------------------
4013 void RegexCompile::nextChar(RegexPatternChar
&c
) {
4015 fScanIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4016 c
.fChar
= nextCharLL();
4021 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
&& ((fModeFlags
& UREGEX_LITERAL
) == 0)) ||
4022 c
.fChar
== (UChar32
)-1) {
4023 fQuoteMode
= FALSE
; // Exit quote mode,
4024 nextCharLL(); // discard the E
4025 nextChar(c
); // recurse to get the real next char
4028 else if (fInBackslashQuote
) {
4029 // The current character immediately follows a '\'
4030 // Don't check for any further escapes, just return it as-is.
4031 // Don't set c.fQuoted, because that would prevent the state machine from
4032 // dispatching on the character.
4033 fInBackslashQuote
= FALSE
;
4037 // We are not in a \Q quoted region \E of the source.
4039 if (fModeFlags
& UREGEX_COMMENTS
) {
4041 // We are in free-spacing and comments mode.
4042 // Scan through any white space and comments, until we
4043 // reach a significant character or the end of inut.
4045 if (c
.fChar
== (UChar32
)-1) {
4046 break; // End of Input
4048 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
4049 // Start of a comment. Consume the rest of it, until EOF or a new line
4051 c
.fChar
= nextCharLL();
4052 if (c
.fChar
== (UChar32
)-1 || // EOF
4061 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4062 if (PatternProps::isWhiteSpace(c
.fChar
) == FALSE
) {
4065 c
.fChar
= nextCharLL();
4070 // check for backslash escaped characters.
4072 if (c
.fChar
== chBackSlash
) {
4073 int64_t pos
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4074 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
.contains(peekCharLL())) {
4076 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4077 // Includes \uxxxx, \n, \r, many others.
4078 // Return the single equivalent character.
4080 nextCharLL(); // get & discard the peeked char.
4083 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat
->fPattern
, fPatternLength
)) {
4084 int32_t endIndex
= (int32_t)pos
;
4085 c
.fChar
= u_unescapeAt(uregex_ucstr_unescape_charAt
, &endIndex
, (int32_t)fPatternLength
, (void *)fRXPat
->fPattern
->chunkContents
);
4087 if (endIndex
== pos
) {
4088 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4090 fCharNum
+= endIndex
- pos
;
4091 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, endIndex
);
4094 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat
->fPattern
);
4096 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, pos
);
4097 c
.fChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
4100 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4101 } else if (context
.lastOffset
== offset
) {
4102 UTEXT_PREVIOUS32(fRXPat
->fPattern
);
4103 } else if (context
.lastOffset
!= offset
-1) {
4104 utext_moveIndex32(fRXPat
->fPattern
, offset
- context
.lastOffset
- 1);
4109 else if (peekCharLL() == chDigit0
) {
4110 // Octal Escape, using Java Regexp Conventions
4111 // which are \0 followed by 1-3 octal digits.
4112 // Different from ICU Unescape handling of Octal, which does not
4113 // require the leading 0.
4114 // Java also has the convention of only consuming 2 octal digits if
4115 // the three digit number would be > 0xff
4118 nextCharLL(); // Consume the initial 0.
4120 for (index
=0; index
<3; index
++) {
4121 int32_t ch
= peekCharLL();
4122 if (ch
<chDigit0
|| ch
>chDigit7
) {
4124 // \0 is not followed by any octal digits.
4125 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4131 if (c
.fChar
<= 255) {
4134 // The last digit made the number too big. Forget we saw it.
4140 else if (peekCharLL() == chQ
) {
4141 // "\Q" enter quote mode, which will continue until "\E"
4143 nextCharLL(); // discard the 'Q'.
4144 nextChar(c
); // recurse to get the real next char.
4148 // We are in a '\' escape that will be handled by the state table scanner.
4149 // Just return the backslash, but remember that the following char is to
4150 // be taken literally.
4151 fInBackslashQuote
= TRUE
;
4156 // re-enable # to end-of-line comments, in case they were disabled.
4157 // They are disabled by the parser upon seeing '(?', but this lasts for
4158 // the fetching of the next character only.
4159 fEOLComments
= TRUE
;
4161 // putc(c.fChar, stdout);
4166 //------------------------------------------------------------------------------
4169 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4171 // The scan position will be at the 'N'. On return
4172 // the scan position should be just after the '}'
4174 // Return the UChar32
4176 //------------------------------------------------------------------------------
4177 UChar32
RegexCompile::scanNamedChar() {
4178 if (U_FAILURE(*fStatus
)) {
4183 if (fC
.fChar
!= chLBrace
) {
4184 error(U_REGEX_PROPERTY_SYNTAX
);
4188 UnicodeString charName
;
4191 if (fC
.fChar
== chRBrace
) {
4194 if (fC
.fChar
== -1) {
4195 error(U_REGEX_PROPERTY_SYNTAX
);
4198 charName
.append(fC
.fChar
);
4202 if (!uprv_isInvariantUString(charName
.getBuffer(), charName
.length()) ||
4203 (uint32_t)charName
.length()>=sizeof(name
)) {
4204 // All Unicode character names have only invariant characters.
4205 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4206 // which requires this error check
4207 error(U_REGEX_PROPERTY_SYNTAX
);
4210 charName
.extract(0, charName
.length(), name
, sizeof(name
), US_INV
);
4212 UChar32 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, name
, fStatus
);
4213 if (U_FAILURE(*fStatus
)) {
4214 error(U_REGEX_PROPERTY_SYNTAX
);
4217 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
4221 //------------------------------------------------------------------------------
4223 // scanProp Construct a UnicodeSet from the text at the current scan
4224 // position, which will be of the form \p{whaterver}
4226 // The scan position will be at the 'p' or 'P'. On return
4227 // the scan position should be just after the '}'
4229 // Return a UnicodeSet, constructed from the \P pattern,
4230 // or NULL if the pattern is invalid.
4232 //------------------------------------------------------------------------------
4233 UnicodeSet
*RegexCompile::scanProp() {
4234 UnicodeSet
*uset
= NULL
;
4236 if (U_FAILURE(*fStatus
)) {
4239 (void)chLowerP
; // Suppress compiler unused variable warning.
4240 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chP
);
4241 UBool negated
= (fC
.fChar
== chP
);
4243 UnicodeString propertyName
;
4245 if (fC
.fChar
!= chLBrace
) {
4246 error(U_REGEX_PROPERTY_SYNTAX
);
4251 if (fC
.fChar
== chRBrace
) {
4254 if (fC
.fChar
== -1) {
4255 // Hit the end of the input string without finding the closing '}'
4256 error(U_REGEX_PROPERTY_SYNTAX
);
4259 propertyName
.append(fC
.fChar
);
4261 uset
= createSetForProperty(propertyName
, negated
);
4262 nextChar(fC
); // Move input scan to position following the closing '}'
4266 //------------------------------------------------------------------------------
4268 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4269 // position, which is expected be of the form [:property expression:]
4271 // The scan position will be at the opening ':'. On return
4272 // the scan position must be on the closing ']'
4274 // Return a UnicodeSet constructed from the pattern,
4275 // or NULL if this is not a valid POSIX-style set expression.
4276 // If not a property expression, restore the initial scan position
4277 // (to the opening ':')
4279 // Note: the opening '[:' is not sufficient to guarantee that
4280 // this is a [:property:] expression.
4281 // [:'+=,] is a perfectly good ordinary set expression that
4282 // happens to include ':' as one of its characters.
4284 //------------------------------------------------------------------------------
4285 UnicodeSet
*RegexCompile::scanPosixProp() {
4286 UnicodeSet
*uset
= NULL
;
4288 if (U_FAILURE(*fStatus
)) {
4292 U_ASSERT(fC
.fChar
== chColon
);
4294 // Save the scanner state.
4295 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4296 int64_t savedScanIndex
= fScanIndex
;
4297 int64_t savedNextIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4298 UBool savedQuoteMode
= fQuoteMode
;
4299 UBool savedInBackslashQuote
= fInBackslashQuote
;
4300 UBool savedEOLComments
= fEOLComments
;
4301 int64_t savedLineNum
= fLineNum
;
4302 int64_t savedCharNum
= fCharNum
;
4303 UChar32 savedLastChar
= fLastChar
;
4304 UChar32 savedPeekChar
= fPeekChar
;
4305 RegexPatternChar savedfC
= fC
;
4307 // Scan for a closing ]. A little tricky because there are some perverse
4308 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4309 // ending on the second closing ].
4311 UnicodeString propName
;
4312 UBool negated
= FALSE
;
4314 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4316 if (fC
.fChar
== chUp
) {
4321 // Scan for the closing ":]", collecting the property name along the way.
4322 UBool sawPropSetTerminator
= FALSE
;
4324 propName
.append(fC
.fChar
);
4326 if (fC
.fQuoted
|| fC
.fChar
== -1) {
4327 // Escaped characters or end of input - either says this isn't a [:Property:]
4330 if (fC
.fChar
== chColon
) {
4332 if (fC
.fChar
== chRBracket
) {
4333 sawPropSetTerminator
= TRUE
;
4339 if (sawPropSetTerminator
) {
4340 uset
= createSetForProperty(propName
, negated
);
4345 // Restore the original scan position.
4346 // The main scanner will retry the input as a normal set expression,
4347 // not a [:Property:] expression.
4348 fScanIndex
= savedScanIndex
;
4349 fQuoteMode
= savedQuoteMode
;
4350 fInBackslashQuote
= savedInBackslashQuote
;
4351 fEOLComments
= savedEOLComments
;
4352 fLineNum
= savedLineNum
;
4353 fCharNum
= savedCharNum
;
4354 fLastChar
= savedLastChar
;
4355 fPeekChar
= savedPeekChar
;
4357 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, savedNextIndex
);
4362 static inline void addIdentifierIgnorable(UnicodeSet
*set
, UErrorCode
& ec
) {
4363 set
->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4364 addCategory(set
, U_GC_CF_MASK
, ec
);
4368 // Create a Unicode Set from a Unicode Property expression.
4369 // This is common code underlying both \p{...} ane [:...:] expressions.
4370 // Includes trying the Java "properties" that aren't supported as
4371 // normal ICU UnicodeSet properties
4373 static const UChar posSetPrefix
[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4374 static const UChar negSetPrefix
[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4375 UnicodeSet
*RegexCompile::createSetForProperty(const UnicodeString
&propName
, UBool negated
) {
4376 UnicodeString setExpr
;
4378 uint32_t usetFlags
= 0;
4380 if (U_FAILURE(*fStatus
)) {
4385 // First try the property as we received it
4388 setExpr
.append(negSetPrefix
, -1);
4390 setExpr
.append(posSetPrefix
, -1);
4392 setExpr
.append(propName
);
4393 setExpr
.append(chRBrace
);
4394 setExpr
.append(chRBracket
);
4395 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
4396 usetFlags
|= USET_CASE_INSENSITIVE
;
4398 set
= new UnicodeSet(setExpr
, usetFlags
, NULL
, *fStatus
);
4399 if (U_SUCCESS(*fStatus
)) {
4406 // The property as it was didn't work.
4408 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
4409 // or standard Java, but many other regular expression packages do recognize it.
4411 if (propName
.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4412 *fStatus
= U_ZERO_ERROR
;
4413 set
= new UnicodeSet(*(fRXPat
->fStaticSets
[URX_ISWORD_SET
]));
4415 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
4426 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4427 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4429 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4430 // is accepted by Java. The property part of the name is compared
4431 // case-insenstively. The spaces must be exactly as shown, either
4432 // all there, or all omitted, with exactly one at each position
4433 // if they are present. From checking against JDK 1.6
4435 // This code should be removed when ICU properties support the Java compatibility names
4438 UnicodeString mPropName
= propName
;
4439 if (mPropName
.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4440 mPropName
= UNICODE_STRING_SIMPLE("InGreek and Coptic");
4442 if (mPropName
.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4443 mPropName
.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4444 mPropName
= UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4446 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4447 mPropName
= UNICODE_STRING_SIMPLE("javaValidCodePoint");
4450 // See if the property looks like a Java "InBlockName", which
4451 // we will recast as "Block=BlockName"
4453 if (mPropName
.startsWith(u
"In", 2) && propName
.length()>=3) {
4454 setExpr
.truncate(4); // Leaves "[\p{", or "[\P{"
4455 setExpr
.append(u
"Block=", -1);
4456 setExpr
.append(UnicodeString(mPropName
, 2)); // Property with the leading "In" removed.
4457 setExpr
.append(chRBrace
);
4458 setExpr
.append(chRBracket
);
4459 *fStatus
= U_ZERO_ERROR
;
4460 set
= new UnicodeSet(setExpr
, usetFlags
, NULL
, *fStatus
);
4461 if (U_SUCCESS(*fStatus
)) {
4468 if (propName
.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4469 propName
.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4471 UErrorCode localStatus
= U_ZERO_ERROR
;
4473 set
= new UnicodeSet();
4475 // Try the various Java specific properties.
4476 // These all begin with "java"
4478 if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4479 addCategory(set
, U_GC_CN_MASK
, localStatus
);
4482 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4483 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4485 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4486 addIdentifierIgnorable(set
, localStatus
);
4488 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4489 set
->add(0, 0x1F).add(0x7F, 0x9F);
4491 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4492 addCategory(set
, U_GC_L_MASK
, localStatus
);
4493 addCategory(set
, U_GC_SC_MASK
, localStatus
);
4494 addCategory(set
, U_GC_PC_MASK
, localStatus
);
4495 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4496 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4497 addCategory(set
, U_GC_MC_MASK
, localStatus
);
4498 addCategory(set
, U_GC_MN_MASK
, localStatus
);
4499 addIdentifierIgnorable(set
, localStatus
);
4501 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4502 addCategory(set
, U_GC_L_MASK
, localStatus
);
4503 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4504 addCategory(set
, U_GC_SC_MASK
, localStatus
);
4505 addCategory(set
, U_GC_PC_MASK
, localStatus
);
4507 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4508 addCategory(set
, U_GC_L_MASK
, localStatus
);
4510 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4511 addCategory(set
, U_GC_L_MASK
, localStatus
);
4512 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4514 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4515 addCategory(set
, U_GC_LL_MASK
, localStatus
);
4517 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4518 set
->applyIntPropertyValue(UCHAR_BIDI_MIRRORED
, 1, localStatus
);
4520 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4521 addCategory(set
, U_GC_Z_MASK
, localStatus
);
4523 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4524 set
->add(0x10000, UnicodeSet::MAX_VALUE
);
4526 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4527 addCategory(set
, U_GC_LT_MASK
, localStatus
);
4529 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4530 addCategory(set
, U_GC_L_MASK
, localStatus
);
4531 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4533 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4534 addCategory(set
, U_GC_L_MASK
, localStatus
);
4535 addCategory(set
, U_GC_PC_MASK
, localStatus
);
4536 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4537 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4538 addCategory(set
, U_GC_MC_MASK
, localStatus
);
4539 addCategory(set
, U_GC_MN_MASK
, localStatus
);
4540 addIdentifierIgnorable(set
, localStatus
);
4542 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4543 addCategory(set
, U_GC_LU_MASK
, localStatus
);
4545 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4546 set
->add(0, UnicodeSet::MAX_VALUE
);
4548 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4549 addCategory(set
, U_GC_Z_MASK
, localStatus
);
4550 set
->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4551 set
->add(9, 0x0d).add(0x1c, 0x1f);
4553 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4554 set
->add(0, UnicodeSet::MAX_VALUE
);
4557 if (U_SUCCESS(localStatus
) && !set
->isEmpty()) {
4558 *fStatus
= U_ZERO_ERROR
;
4559 if (usetFlags
& USET_CASE_INSENSITIVE
) {
4560 set
->closeOver(USET_CASE_INSENSITIVE
);
4577 // SetEval Part of the evaluation of [set expressions].
4578 // Perform any pending (stacked) operations with precedence
4579 // equal or greater to that of the next operator encountered
4580 // in the expression.
4582 void RegexCompile::setEval(int32_t nextOp
) {
4583 UnicodeSet
*rightOperand
= NULL
;
4584 UnicodeSet
*leftOperand
= NULL
;
4586 U_ASSERT(fSetOpStack
.empty()==FALSE
);
4587 int32_t pendingSetOperation
= fSetOpStack
.peeki();
4588 if ((pendingSetOperation
&0xffff0000) < (nextOp
&0xffff0000)) {
4592 U_ASSERT(fSetStack
.empty() == FALSE
);
4593 rightOperand
= (UnicodeSet
*)fSetStack
.peek();
4594 switch (pendingSetOperation
) {
4596 rightOperand
->complement();
4599 // TODO: need a simple close function. Ticket 6065
4600 rightOperand
->closeOver(USET_CASE_INSENSITIVE
);
4601 rightOperand
->removeAllStrings();
4603 case setDifference1
:
4604 case setDifference2
:
4606 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4607 leftOperand
->removeAll(*rightOperand
);
4608 delete rightOperand
;
4610 case setIntersection1
:
4611 case setIntersection2
:
4613 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4614 leftOperand
->retainAll(*rightOperand
);
4615 delete rightOperand
;
4619 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4620 leftOperand
->addAll(*rightOperand
);
4621 delete rightOperand
;
4630 void RegexCompile::setPushOp(int32_t op
) {
4632 fSetOpStack
.push(op
, *fStatus
);
4633 fSetStack
.push(new UnicodeSet(), *fStatus
);
4637 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS