4 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
5 // All Rights Reserved.
7 // This file contains the ICU regular expression compiler, which is responsible
8 // for processing a regular expression pattern into the compiled form that
9 // is used by the match finding engine.
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 #include "unicode/ustring.h"
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "patternprops.h"
36 #include "regexcst.h" // Contains state table for the regex pattern parser.
37 // generated by a Perl script.
47 //------------------------------------------------------------------------------
51 //------------------------------------------------------------------------------
52 RegexCompile::RegexCompile(RegexPattern
*rxp
, UErrorCode
&status
) :
53 fParenStack(status
), fSetStack(status
), fSetOpStack(status
)
55 // Lazy init of all shared global sets (needed for init()'s empty text)
56 RegexStaticSets::initGlobals(&status
);
67 fInBackslashQuote
= FALSE
;
68 fModeFlags
= fRXPat
->fFlags
| 0x80000000;
72 fMatchCloseParen
= -1;
74 fLastSetLiteral
= U_SENTINEL
;
76 if (U_SUCCESS(status
) && U_FAILURE(rxp
->fDeferredStatus
)) {
77 status
= rxp
->fDeferredStatus
;
81 static const UChar chAmp
= 0x26; // '&'
82 static const UChar chDash
= 0x2d; // '-'
85 //------------------------------------------------------------------------------
89 //------------------------------------------------------------------------------
90 RegexCompile::~RegexCompile() {
91 delete fCaptureName
; // Normally will be NULL, but can exist if pattern
92 // compilation stops with a syntax error.
95 static inline void addCategory(UnicodeSet
*set
, int32_t value
, UErrorCode
& ec
) {
96 set
->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, value
, ec
));
99 //------------------------------------------------------------------------------
101 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
102 // The state tables are hand-written in the file regexcst.txt,
103 // and converted to the form used here by a perl
104 // script regexcst.pl
106 //------------------------------------------------------------------------------
107 void RegexCompile::compile(
108 const UnicodeString
&pat
, // Source pat to be compiled.
109 UParseError
&pp
, // Error position info
110 UErrorCode
&e
) // Error Code
112 fRXPat
->fPatternString
= new UnicodeString(pat
);
113 UText patternText
= UTEXT_INITIALIZER
;
114 utext_openConstUnicodeString(&patternText
, fRXPat
->fPatternString
, &e
);
117 compile(&patternText
, pp
, e
);
118 utext_close(&patternText
);
123 // compile, UText mode
124 // All the work is actually done here.
126 void RegexCompile::compile(
127 UText
*pat
, // Source pat to be compiled.
128 UParseError
&pp
, // Error position info
129 UErrorCode
&e
) // Error Code
134 fStack
[fStackPtr
] = 0;
136 if (U_FAILURE(*fStatus
)) {
140 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
141 U_ASSERT(fRXPat
->fPattern
== NULL
|| utext_nativeLength(fRXPat
->fPattern
) == 0);
143 // Prepare the RegexPattern object to receive the compiled pattern.
144 fRXPat
->fPattern
= utext_clone(fRXPat
->fPattern
, pat
, FALSE
, TRUE
, fStatus
);
145 if (U_FAILURE(*fStatus
)) {
148 fRXPat
->fStaticSets
= RegexStaticSets::gStaticSets
->fPropSets
;
149 fRXPat
->fStaticSets8
= RegexStaticSets::gStaticSets
->fPropSets8
;
152 // Initialize the pattern scanning state machine
153 fPatternLength
= utext_nativeLength(pat
);
155 const RegexTableEl
*tableEl
;
157 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158 if (fModeFlags
& UREGEX_LITERAL
) {
162 nextChar(fC
); // Fetch the first char from the pattern string.
165 // Main loop for the regex pattern parsing state machine.
166 // Runs once per state transition.
167 // Each time through optionally performs, depending on the state table,
168 // - an advance to the the next pattern char
169 // - an action to be performed.
170 // - pushing or popping a state to/from the local state return stack.
171 // file regexcst.txt is the source for the state table. The logic behind
172 // recongizing the pattern syntax is there, not here.
175 // Bail out if anything has gone wrong.
176 // Regex pattern parsing stops on the first error encountered.
177 if (U_FAILURE(*fStatus
)) {
181 U_ASSERT(state
!= 0);
183 // Find the state table element that matches the input char from the pattern, or the
184 // class of the input character. Start with the first table row for this
185 // state, then linearly scan forward until we find a row that matches the
186 // character. The last row for each state always matches all characters, so
187 // the search will stop there, if not before.
189 tableEl
= &gRuleParseStateTable
[state
];
190 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191 fC
.fChar
, fLineNum
, fCharNum
, RegexStateNames
[state
]));
193 for (;;) { // loop through table rows belonging to this state, looking for one
194 // that matches the current input char.
195 REGEX_SCAN_DEBUG_PRINTF(("."));
196 if (tableEl
->fCharClass
< 127 && fC
.fQuoted
== FALSE
&& tableEl
->fCharClass
== fC
.fChar
) {
197 // Table row specified an individual character, not a set, and
198 // the input character is not quoted, and
199 // the input character matched it.
202 if (tableEl
->fCharClass
== 255) {
203 // Table row specified default, match anything character class.
206 if (tableEl
->fCharClass
== 254 && fC
.fQuoted
) {
207 // Table row specified "quoted" and the char was quoted.
210 if (tableEl
->fCharClass
== 253 && fC
.fChar
== (UChar32
)-1) {
211 // Table row specified eof and we hit eof on the input.
215 if (tableEl
->fCharClass
>= 128 && tableEl
->fCharClass
< 240 && // Table specs a char class &&
216 fC
.fQuoted
== FALSE
&& // char is not escaped &&
217 fC
.fChar
!= (UChar32
)-1) { // char is not EOF
218 U_ASSERT(tableEl
->fCharClass
<= 137);
219 if (RegexStaticSets::gStaticSets
->fRuleSets
[tableEl
->fCharClass
-128].contains(fC
.fChar
)) {
220 // Table row specified a character class, or set of characters,
221 // and the current char matches it.
226 // No match on this row, advance to the next row for this state,
229 REGEX_SCAN_DEBUG_PRINTF(("\n"));
232 // We've found the row of the state table that matches the current input
233 // character from the rules string.
234 // Perform any action specified by this row in the state table.
235 if (doParseActions(tableEl
->fAction
) == FALSE
) {
236 // Break out of the state machine loop if the
237 // the action signalled some kind of error, or
238 // the action was to exit, occurs on normal end-of-rules-input.
242 if (tableEl
->fPushState
!= 0) {
244 if (fStackPtr
>= kStackSize
) {
245 error(U_REGEX_INTERNAL_ERROR
);
246 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
249 fStack
[fStackPtr
] = tableEl
->fPushState
;
253 // NextChar. This is where characters are actually fetched from the pattern.
254 // Happens under control of the 'n' tag in the state table.
256 if (tableEl
->fNextChar
) {
260 // Get the next state from the table entry, or from the
261 // state stack if the next state was specified as "pop".
262 if (tableEl
->fNextState
!= 255) {
263 state
= tableEl
->fNextState
;
265 state
= fStack
[fStackPtr
];
268 // state stack underflow
269 // This will occur if the user pattern has mis-matched parentheses,
270 // with extra close parens.
273 error(U_REGEX_MISMATCHED_PAREN
);
279 if (U_FAILURE(*fStatus
)) {
280 // Bail out if the pattern had errors.
281 // Set stack cleanup: a successful compile would have left it empty,
282 // but errors can leave temporary sets hanging around.
283 while (!fSetStack
.empty()) {
284 delete (UnicodeSet
*)fSetStack
.pop();
290 // The pattern has now been read and processed, and the compiled code generated.
294 // The pattern's fFrameSize so far has accumulated the requirements for
295 // storage for capture parentheses, counters, etc. that are encountered
296 // in the pattern. Add space for the two variables that are always
297 // present in the saved state: the input string position (int64_t) and
298 // the position in the compiled pattern.
300 allocateStackData(RESTACKFRAME_HDRCOUNT
);
303 // Optimization pass 1: NOPs, back-references, and case-folding
308 // Get bounds for the minimum and maximum length of a string that this
309 // pattern can match. Used to avoid looking for matches in strings that
312 fRXPat
->fMinMatchLen
= minMatchLength(3, fRXPat
->fCompiledPat
->size()-1);
315 // Optimization pass 2: match start type
320 // Set up fast latin-1 range sets
322 int32_t numSets
= fRXPat
->fSets
->size();
323 fRXPat
->fSets8
= new Regex8BitSet
[numSets
];
324 // Null pointer check.
325 if (fRXPat
->fSets8
== NULL
) {
326 e
= *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
330 for (i
=0; i
<numSets
; i
++) {
331 UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(i
);
332 fRXPat
->fSets8
[i
].init(s
);
341 //------------------------------------------------------------------------------
343 // doParseAction Do some action during regex pattern parsing.
344 // Called by the parse state machine.
346 // Generation of the match engine PCode happens here, or
347 // in functions called from the parse actions defined here.
350 //------------------------------------------------------------------------------
351 UBool
RegexCompile::doParseActions(int32_t action
)
353 UBool returnVal
= TRUE
;
355 switch ((Regex_PatternParseAction
)action
) {
358 // Start of pattern compiles to:
359 //0 SAVE 2 Fall back to position of FAIL
361 //2 FAIL Stop if we ever reach here.
362 //3 NOP Dummy, so start of pattern looks the same as
363 // the start of an ( grouping.
364 //4 NOP Resreved, will be replaced by a save if there are
365 // OR | operators at the top level
366 appendOp(URX_STATE_SAVE
, 2);
367 appendOp(URX_JMP
, 3);
368 appendOp(URX_FAIL
, 0);
370 // Standard open nonCapture paren action emits the two NOPs and
371 // sets up the paren stack frame.
372 doParseActions(doOpenNonCaptureParen
);
376 // We've scanned to the end of the pattern
377 // The end of pattern compiles to:
379 // which will stop the runtime match engine.
380 // Encountering end of pattern also behaves like a close paren,
381 // and forces fixups of the State Save at the beginning of the compiled pattern
382 // and of any OR operations at the top level.
385 if (fParenStack
.size() > 0) {
386 // Missing close paren in pattern.
387 error(U_REGEX_MISMATCHED_PAREN
);
390 // add the END operation to the compiled pattern.
391 appendOp(URX_END
, 0);
393 // Terminate the pattern compilation state machine.
400 // Scanning a '|', as in (A|B)
402 // Generate code for any pending literals preceding the '|'
405 // Insert a SAVE operation at the start of the pattern section preceding
406 // this OR at this level. This SAVE will branch the match forward
407 // to the right hand side of the OR in the event that the left hand
408 // side fails to match and backtracks. Locate the position for the
409 // save from the location on the top of the parentheses stack.
410 int32_t savePosition
= fParenStack
.popi();
411 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(savePosition
);
412 U_ASSERT(URX_TYPE(op
) == URX_NOP
); // original contents of reserved location
413 op
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+1);
414 fRXPat
->fCompiledPat
->setElementAt(op
, savePosition
);
416 // Append an JMP operation into the compiled pattern. The operand for
417 // the JMP will eventually be the location following the ')' for the
418 // group. This will be patched in later, when the ')' is encountered.
419 appendOp(URX_JMP
, 0);
421 // Push the position of the newly added JMP op onto the parentheses stack.
422 // This registers if for fixup when this block's close paren is encountered.
423 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
425 // Append a NOP to the compiled pattern. This is the slot reserved
426 // for a SAVE in the event that there is yet another '|' following
428 appendOp(URX_NOP
, 0);
429 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
);
434 case doBeginNamedCapture
:
435 // Scanning (?<letter.
436 // The first letter of the name will come through again under doConinueNamedCapture.
437 fCaptureName
= new UnicodeString();
438 if (fCaptureName
== NULL
) {
439 error(U_MEMORY_ALLOCATION_ERROR
);
443 case doContinueNamedCapture
:
444 fCaptureName
->append(fC
.fChar
);
447 case doBadNamedCapture
:
448 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
451 case doOpenCaptureParen
:
452 // Open Capturing Paren, possibly named.
454 // - NOP, which later may be replaced by a save-state if the
455 // parenthesized group gets a * quantifier, followed by
456 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
457 // - NOP, which may later be replaced by a save-state if there
458 // is an '|' alternation within the parens.
460 // Each capture group gets three slots in the save stack frame:
461 // 0: Capture Group start position (in input string being matched.)
462 // 1: Capture Group end position.
463 // 2: Start of Match-in-progress.
464 // The first two locations are for a completed capture group, and are
465 // referred to by back references and the like.
466 // The third location stores the capture start position when an START_CAPTURE is
467 // encountered. This will be promoted to a completed capture when (and if) the corresponding
468 // END_CAPTURE is encountered.
471 appendOp(URX_NOP
, 0);
472 int32_t varsLoc
= allocateStackData(3); // Reserve three slots in match stack frame.
473 appendOp(URX_START_CAPTURE
, varsLoc
);
474 appendOp(URX_NOP
, 0);
476 // On the Parentheses stack, start a new frame and add the postions
477 // of the two NOPs. Depending on what follows in the pattern, the
478 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
479 // address of the end of the parenthesized group.
480 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
481 fParenStack
.push(capturing
, *fStatus
); // Frame type.
482 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP location
483 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
485 // Save the mapping from group number to stack frame variable position.
486 fRXPat
->fGroupMap
->addElement(varsLoc
, *fStatus
);
488 // If this is a named capture group, add the name->group number mapping.
489 if (fCaptureName
!= NULL
) {
490 int32_t groupNumber
= fRXPat
->fGroupMap
->size();
491 int32_t previousMapping
= uhash_puti(fRXPat
->fNamedCaptureMap
, fCaptureName
, groupNumber
, fStatus
);
492 fCaptureName
= NULL
; // hash table takes ownership of the name (key) string.
493 if (previousMapping
> 0 && U_SUCCESS(*fStatus
)) {
494 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
500 case doOpenNonCaptureParen
:
501 // Open non-caputuring (grouping only) Paren.
503 // - NOP, which later may be replaced by a save-state if the
504 // parenthesized group gets a * quantifier, followed by
505 // - NOP, which may later be replaced by a save-state if there
506 // is an '|' alternation within the parens.
509 appendOp(URX_NOP
, 0);
510 appendOp(URX_NOP
, 0);
512 // On the Parentheses stack, start a new frame and add the postions
514 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
515 fParenStack
.push(plain
, *fStatus
); // Begin a new frame.
516 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
517 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP loc
522 case doOpenAtomicParen
:
523 // Open Atomic Paren. (?>
525 // - NOP, which later may be replaced if the parenthesized group
526 // has a quantifier, followed by
527 // - STO_SP save state stack position, so it can be restored at the ")"
528 // - NOP, which may later be replaced by a save-state if there
529 // is an '|' alternation within the parens.
532 appendOp(URX_NOP
, 0);
533 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the state stack ptr.
534 appendOp(URX_STO_SP
, varLoc
);
535 appendOp(URX_NOP
, 0);
537 // On the Parentheses stack, start a new frame and add the postions
538 // of the two NOPs. Depending on what follows in the pattern, the
539 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
540 // address of the end of the parenthesized group.
541 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
542 fParenStack
.push(atomic
, *fStatus
); // Frame type.
543 fParenStack
.push(fRXPat
->fCompiledPat
->size()-3, *fStatus
); // The first NOP
544 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
549 case doOpenLookAhead
:
550 // Positive Look-ahead (?= stuff )
552 // Note: Addition of transparent input regions, with the need to
553 // restore the original regions when failing out of a lookahead
554 // block, complicated this sequence. Some conbined opcodes
555 // might make sense - or might not, lookahead aren't that common.
557 // Caution: min match length optimization knows about this
558 // sequence; don't change without making updates there too.
561 // 1 START_LA dataLoc Saves SP, Input Pos
562 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
563 // 3 JMP 6 continue ...
565 // 4. LA_END Look Ahead failed. Restore regions.
566 // 5. BACKTRACK and back track again.
568 // 6. NOP reserved for use by quantifiers on the block.
569 // Look-ahead can't have quantifiers, but paren stack
570 // compile time conventions require the slot anyhow.
571 // 7. NOP may be replaced if there is are '|' ops in the block.
572 // 8. code for parenthesized stuff.
575 // Two data slots are reserved, for saving the stack ptr and the input position.
578 int32_t dataLoc
= allocateData(2);
579 appendOp(URX_LA_START
, dataLoc
);
580 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+ 2);
581 appendOp(URX_JMP
, fRXPat
->fCompiledPat
->size()+ 3);
582 appendOp(URX_LA_END
, dataLoc
);
583 appendOp(URX_BACKTRACK
, 0);
584 appendOp(URX_NOP
, 0);
585 appendOp(URX_NOP
, 0);
587 // On the Parentheses stack, start a new frame and add the postions
589 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
590 fParenStack
.push(lookAhead
, *fStatus
); // Frame type.
591 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
592 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
596 case doOpenLookAheadNeg
:
597 // Negated Lookahead. (?! stuff )
599 // 1. START_LA dataloc
600 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
601 // // which continues with the match.
602 // 3. NOP // Std. Open Paren sequence, for possible '|'
603 // 4. code for parenthesized stuff.
604 // 5. END_LA // Cut back stack, remove saved state from step 2.
605 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
606 // 7. END_LA // Restore match region, in case look-ahead was using
607 // an alternate (transparent) region.
610 int32_t dataLoc
= allocateData(2);
611 appendOp(URX_LA_START
, dataLoc
);
612 appendOp(URX_STATE_SAVE
, 0); // dest address will be patched later.
613 appendOp(URX_NOP
, 0);
615 // On the Parentheses stack, start a new frame and add the postions
616 // of the StateSave and NOP.
617 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
618 fParenStack
.push(negLookAhead
, *fStatus
); // Frame type
619 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The STATE_SAVE location
620 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP location
622 // Instructions #5 - #7 will be added when the ')' is encountered.
626 case doOpenLookBehind
:
628 // Compile a (?<= look-behind open paren.
631 // 0 URX_LB_START dataLoc
632 // 1 URX_LB_CONT dataLoc
635 // 4 URX_NOP Standard '(' boilerplate.
636 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
637 // 6 <code for LookBehind expression>
638 // 7 URX_LB_END dataLoc # Check match len, restore input len
639 // 8 URX_LA_END dataLoc # Restore stack, input pos
641 // Allocate a block of matcher data, to contain (when running a match)
642 // 0: Stack ptr on entry
643 // 1: Input Index on entry
644 // 2: Start index of match current match attempt.
645 // 3: Original Input String len.
647 // Generate match code for any pending literals.
650 // Allocate data space
651 int32_t dataLoc
= allocateData(4);
654 appendOp(URX_LB_START
, dataLoc
);
657 appendOp(URX_LB_CONT
, dataLoc
);
658 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
659 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
662 appendOp(URX_NOP
, 0);
663 appendOp(URX_NOP
, 0);
665 // On the Parentheses stack, start a new frame and add the postions
666 // of the URX_LB_CONT and the NOP.
667 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
668 fParenStack
.push(lookBehind
, *fStatus
); // Frame type
669 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
670 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
672 // The final two instructions will be added when the ')' is encountered.
677 case doOpenLookBehindNeg
:
679 // Compile a (?<! negated look-behind open paren.
682 // 0 URX_LB_START dataLoc # Save entry stack, input len
683 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
687 // 5 URX_NOP Standard '(' boilerplate.
688 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
689 // 7 <code for LookBehind expression>
690 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
693 // Allocate a block of matcher data, to contain (when running a match)
694 // 0: Stack ptr on entry
695 // 1: Input Index on entry
696 // 2: Start index of match current match attempt.
697 // 3: Original Input String len.
699 // Generate match code for any pending literals.
702 // Allocate data space
703 int32_t dataLoc
= allocateData(4);
706 appendOp(URX_LB_START
, dataLoc
);
709 appendOp(URX_LBN_CONT
, dataLoc
);
710 appendOp(URX_RESERVED_OP
, 0); // MinMatchLength. To be filled later.
711 appendOp(URX_RESERVED_OP
, 0); // MaxMatchLength. To be filled later.
712 appendOp(URX_RESERVED_OP
, 0); // Continue Loc. To be filled later.
715 appendOp(URX_NOP
, 0);
716 appendOp(URX_NOP
, 0);
718 // On the Parentheses stack, start a new frame and add the postions
719 // of the URX_LB_CONT and the NOP.
720 fParenStack
.push(fModeFlags
, *fStatus
); // Match mode state
721 fParenStack
.push(lookBehindN
, *fStatus
); // Frame type
722 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP location
723 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The 2nd NOP location
725 // The final two instructions will be added when the ')' is encountered.
729 case doConditionalExpr
:
730 // Conditionals such as (?(1)a:b)
732 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
733 error(U_REGEX_UNIMPLEMENTED
);
739 if (fParenStack
.size() <= 0) {
740 // Extra close paren, or missing open paren.
741 error(U_REGEX_MISMATCHED_PAREN
);
749 case doBadOpenParenType
:
751 error(U_REGEX_RULE_SYNTAX
);
755 case doMismatchedParenErr
:
756 error(U_REGEX_MISMATCHED_PAREN
);
760 // Normal '+' compiles to
761 // 1. stuff to be repeated (already built)
765 // Or, if the item to be repeated can match a zero length string,
766 // 1. STO_INP_LOC data-loc
767 // 2. body of stuff to be repeated
772 // Or, if the item to be repeated is simple
773 // 1. Item to be repeated.
774 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
775 // 3. LOOP_C stack location
777 int32_t topLoc
= blockTopLoc(FALSE
); // location of item #1
780 // Check for simple constructs, which may get special optimized code.
781 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
782 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
784 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
785 // Emit optimized code for [char set]+
786 appendOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
787 frameLoc
= allocateStackData(1);
788 appendOp(URX_LOOP_C
, frameLoc
);
792 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
793 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
794 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
795 // Emit Optimized code for .+ operations.
796 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
797 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
798 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
801 if (fModeFlags
& UREGEX_UNIX_LINES
) {
805 frameLoc
= allocateStackData(1);
806 appendOp(URX_LOOP_C
, frameLoc
);
814 // Check for minimum match length of zero, which requires
815 // extra loop-breaking code.
816 if (minMatchLength(topLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
817 // Zero length match is possible.
818 // Emit the code sequence that can handle it.
820 frameLoc
= allocateStackData(1);
822 int32_t op
= buildOp(URX_STO_INP_LOC
, frameLoc
);
823 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
825 appendOp(URX_JMP_SAV_X
, topLoc
+1);
827 // Simpler code when the repeated body must match something non-empty
828 appendOp(URX_JMP_SAV
, topLoc
);
834 // Non-greedy '+?' compiles to
835 // 1. stuff to be repeated (already built)
839 int32_t topLoc
= blockTopLoc(FALSE
);
840 appendOp(URX_STATE_SAVE
, topLoc
);
846 // Normal (greedy) ? quantifier.
849 // 2. body of optional block
851 // Insert the state save into the compiled pattern, and we're done.
853 int32_t saveStateLoc
= blockTopLoc(TRUE
);
854 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size());
855 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
860 // Non-greedy ?? quantifier
863 // 2. body of optional block
867 // This code is less than ideal, with two jmps instead of one, because we can only
868 // insert one instruction at the top of the block being iterated.
870 int32_t jmp1_loc
= blockTopLoc(TRUE
);
871 int32_t jmp2_loc
= fRXPat
->fCompiledPat
->size();
873 int32_t jmp1_op
= buildOp(URX_JMP
, jmp2_loc
+1);
874 fRXPat
->fCompiledPat
->setElementAt(jmp1_op
, jmp1_loc
);
876 appendOp(URX_JMP
, jmp2_loc
+2);
878 appendOp(URX_STATE_SAVE
, jmp1_loc
+1);
884 // Normal (greedy) * quantifier.
887 // 2. body of stuff being iterated over
891 // Or, if the body is a simple [Set],
892 // 1. LOOP_SR_I set number
893 // 2. LOOP_C stack location
896 // Or if this is a .*
897 // 1. LOOP_DOT_I (. matches all mode flag)
898 // 2. LOOP_C stack location
900 // Or, if the body can match a zero-length string, to inhibit infinite loops,
902 // 2. STO_INP_LOC data-loc
907 // location of item #1, the STATE_SAVE
908 int32_t topLoc
= blockTopLoc(FALSE
);
909 int32_t dataLoc
= -1;
911 // Check for simple *, where the construct being repeated
912 // compiled to single opcode, and might be optimizable.
913 if (topLoc
== fRXPat
->fCompiledPat
->size() - 1) {
914 int32_t repeatedOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topLoc
);
916 if (URX_TYPE(repeatedOp
) == URX_SETREF
) {
917 // Emit optimized code for a [char set]*
918 int32_t loopOpI
= buildOp(URX_LOOP_SR_I
, URX_VAL(repeatedOp
));
919 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
920 dataLoc
= allocateStackData(1);
921 appendOp(URX_LOOP_C
, dataLoc
);
925 if (URX_TYPE(repeatedOp
) == URX_DOTANY
||
926 URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
||
927 URX_TYPE(repeatedOp
) == URX_DOTANY_UNIX
) {
928 // Emit Optimized code for .* operations.
929 int32_t loopOpI
= buildOp(URX_LOOP_DOT_I
, 0);
930 if (URX_TYPE(repeatedOp
) == URX_DOTANY_ALL
) {
931 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
934 if ((fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
937 fRXPat
->fCompiledPat
->setElementAt(loopOpI
, topLoc
);
938 dataLoc
= allocateStackData(1);
939 appendOp(URX_LOOP_C
, dataLoc
);
944 // Emit general case code for this *
945 // The optimizations did not apply.
947 int32_t saveStateLoc
= blockTopLoc(TRUE
);
948 int32_t jmpOp
= buildOp(URX_JMP_SAV
, saveStateLoc
+1);
950 // Check for minimum match length of zero, which requires
951 // extra loop-breaking code.
952 if (minMatchLength(saveStateLoc
, fRXPat
->fCompiledPat
->size()-1) == 0) {
953 insertOp(saveStateLoc
);
954 dataLoc
= allocateStackData(1);
956 int32_t op
= buildOp(URX_STO_INP_LOC
, dataLoc
);
957 fRXPat
->fCompiledPat
->setElementAt(op
, saveStateLoc
+1);
958 jmpOp
= buildOp(URX_JMP_SAV_X
, saveStateLoc
+2);
961 // Locate the position in the compiled pattern where the match will continue
962 // after completing the *. (4 or 5 in the comment above)
963 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
965 // Put together the save state op and store it into the compiled code.
966 int32_t saveStateOp
= buildOp(URX_STATE_SAVE
, continueLoc
);
967 fRXPat
->fCompiledPat
->setElementAt(saveStateOp
, saveStateLoc
);
969 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
975 // Non-greedy *? quantifier
978 // 2. body of stuff being iterated over
982 int32_t jmpLoc
= blockTopLoc(TRUE
); // loc 1.
983 int32_t saveLoc
= fRXPat
->fCompiledPat
->size(); // loc 3.
984 int32_t jmpOp
= buildOp(URX_JMP
, saveLoc
);
985 fRXPat
->fCompiledPat
->setElementAt(jmpOp
, jmpLoc
);
986 appendOp(URX_STATE_SAVE
, jmpLoc
+1);
992 // The '{' opening an interval quantifier was just scanned.
993 // Init the counter varaiables that will accumulate the values as the digits
999 case doIntevalLowerDigit
:
1000 // Scanned a digit from the lower value of an {lower,upper} interval
1002 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1003 U_ASSERT(digitValue
>= 0);
1004 int64_t val
= (int64_t)fIntervalLow
*10 + digitValue
;
1005 if (val
> INT32_MAX
) {
1006 error(U_REGEX_NUMBER_TOO_BIG
);
1008 fIntervalLow
= (int32_t)val
;
1013 case doIntervalUpperDigit
:
1014 // Scanned a digit from the upper value of an {lower,upper} interval
1016 if (fIntervalUpper
< 0) {
1019 int32_t digitValue
= u_charDigitValue(fC
.fChar
);
1020 U_ASSERT(digitValue
>= 0);
1021 int64_t val
= (int64_t)fIntervalUpper
*10 + digitValue
;
1022 if (val
> INT32_MAX
) {
1023 error(U_REGEX_NUMBER_TOO_BIG
);
1025 fIntervalUpper
= (int32_t)val
;
1030 case doIntervalSame
:
1031 // Scanned a single value interval like {27}. Upper = Lower.
1032 fIntervalUpper
= fIntervalLow
;
1036 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1037 if (compileInlineInterval() == FALSE
) {
1038 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1042 case doPossessiveInterval
:
1043 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1045 // Remember the loc for the top of the block being looped over.
1046 // (Can not reserve a slot in the compiled pattern at this time, because
1047 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1049 int32_t topLoc
= blockTopLoc(FALSE
);
1051 // Produce normal looping code.
1052 compileInterval(URX_CTR_INIT
, URX_CTR_LOOP
);
1054 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1055 // just as if the loop was inclosed in atomic parentheses.
1057 // First the STO_SP before the start of the loop
1060 int32_t varLoc
= allocateData(1); // Reserve a data location for saving the
1061 int32_t op
= buildOp(URX_STO_SP
, varLoc
);
1062 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1064 int32_t loopOp
= (int32_t)fRXPat
->fCompiledPat
->popi();
1065 U_ASSERT(URX_TYPE(loopOp
) == URX_CTR_LOOP
&& URX_VAL(loopOp
) == topLoc
);
1066 loopOp
++; // point LoopOp after the just-inserted STO_SP
1067 fRXPat
->fCompiledPat
->push(loopOp
, *fStatus
);
1069 // Then the LD_SP after the end of the loop
1070 appendOp(URX_LD_SP
, varLoc
);
1076 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1077 compileInterval(URX_CTR_INIT_NG
, URX_CTR_LOOP_NG
);
1080 case doIntervalError
:
1081 error(U_REGEX_BAD_INTERVAL
);
1085 // We've just scanned a "normal" character from the pattern,
1086 literalChar(fC
.fChar
);
1090 case doEscapedLiteralChar
:
1091 // We've just scanned an backslashed escaped character with no
1092 // special meaning. It represents itself.
1093 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1094 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1095 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1096 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1098 literalChar(fC
.fChar
);
1103 // scanned a ".", match any single character.
1106 if (fModeFlags
& UREGEX_DOTALL
) {
1107 appendOp(URX_DOTANY_ALL
, 0);
1108 } else if (fModeFlags
& UREGEX_UNIX_LINES
) {
1109 appendOp(URX_DOTANY_UNIX
, 0);
1111 appendOp(URX_DOTANY
, 0);
1119 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1120 appendOp(URX_CARET
, 0);
1121 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1122 appendOp(URX_CARET_M
, 0);
1123 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1124 appendOp(URX_CARET
, 0); // Only testing true start of input.
1125 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1126 appendOp(URX_CARET_M_UNIX
, 0);
1134 if ( (fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1135 appendOp(URX_DOLLAR
, 0);
1136 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) == 0) {
1137 appendOp(URX_DOLLAR_M
, 0);
1138 } else if ((fModeFlags
& UREGEX_MULTILINE
) == 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1139 appendOp(URX_DOLLAR_D
, 0);
1140 } else if ((fModeFlags
& UREGEX_MULTILINE
) != 0 && (fModeFlags
& UREGEX_UNIX_LINES
) != 0) {
1141 appendOp(URX_DOLLAR_MD
, 0);
1148 appendOp(URX_CARET
, 0);
1153 #if UCONFIG_NO_BREAK_ITERATION==1
1154 if (fModeFlags
& UREGEX_UWORD
) {
1155 error(U_UNSUPPORTED_ERROR
);
1159 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1166 #if UCONFIG_NO_BREAK_ITERATION==1
1167 if (fModeFlags
& UREGEX_UWORD
) {
1168 error(U_UNSUPPORTED_ERROR
);
1172 int32_t op
= (fModeFlags
& UREGEX_UWORD
)? URX_BACKSLASH_BU
: URX_BACKSLASH_B
;
1179 appendOp(URX_BACKSLASH_D
, 1);
1184 appendOp(URX_BACKSLASH_D
, 0);
1189 appendOp(URX_BACKSLASH_G
, 0);
1194 appendOp(URX_BACKSLASH_H
, 1);
1199 appendOp(URX_BACKSLASH_H
, 0);
1204 appendOp(URX_BACKSLASH_R
, 0);
1209 appendOp(URX_STAT_SETREF_N
, URX_ISSPACE_SET
);
1214 appendOp(URX_STATIC_SETREF
, URX_ISSPACE_SET
);
1219 appendOp(URX_BACKSLASH_V
, 1);
1224 appendOp(URX_BACKSLASH_V
, 0);
1229 appendOp(URX_STAT_SETREF_N
, URX_ISWORD_SET
);
1234 appendOp(URX_STATIC_SETREF
, URX_ISWORD_SET
);
1239 appendOp(URX_BACKSLASH_X
, 0);
1245 appendOp(URX_DOLLAR
, 0);
1250 appendOp(URX_BACKSLASH_Z
, 0);
1254 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1265 UnicodeSet
*theSet
= scanProp();
1272 UChar32 c
= scanNamedChar();
1279 // BackReference. Somewhat unusual in that the front-end can not completely parse
1280 // the regular expression, because the number of digits to be consumed
1281 // depends on the number of capture groups that have been defined. So
1282 // we have to do it here instead.
1284 int32_t numCaptureGroups
= fRXPat
->fGroupMap
->size();
1285 int32_t groupNum
= 0;
1286 UChar32 c
= fC
.fChar
;
1289 // Loop once per digit, for max allowed number of digits in a back reference.
1290 int32_t digit
= u_charDigitValue(c
);
1291 groupNum
= groupNum
* 10 + digit
;
1292 if (groupNum
>= numCaptureGroups
) {
1296 if (RegexStaticSets::gStaticSets
->fRuleDigitsAlias
->contains(c
) == FALSE
) {
1302 // Scan of the back reference in the source regexp is complete. Now generate
1303 // the compiled code for it.
1304 // Because capture groups can be forward-referenced by back-references,
1305 // we fill the operand with the capture group number. At the end
1306 // of compilation, it will be changed to the variable's location.
1307 U_ASSERT(groupNum
> 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1308 // and shouldn't enter this code path at all.
1310 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1311 appendOp(URX_BACKREF_I
, groupNum
);
1313 appendOp(URX_BACKREF
, groupNum
);
1318 case doBeginNamedBackRef
:
1319 U_ASSERT(fCaptureName
== NULL
);
1320 fCaptureName
= new UnicodeString
;
1321 if (fCaptureName
== NULL
) {
1322 error(U_MEMORY_ALLOCATION_ERROR
);
1326 case doContinueNamedBackRef
:
1327 fCaptureName
->append(fC
.fChar
);
1330 case doCompleteNamedBackRef
:
1332 int32_t groupNumber
= uhash_geti(fRXPat
->fNamedCaptureMap
, fCaptureName
);
1333 if (groupNumber
== 0) {
1334 // Group name has not been defined.
1335 // Could be a forward reference. If we choose to support them at some
1336 // future time, extra mechanism will be required at this point.
1337 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME
);
1339 // Given the number, handle identically to a \n numbered back reference.
1340 // See comments above, under doBackRef
1342 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1343 appendOp(URX_BACKREF_I
, groupNumber
);
1345 appendOp(URX_BACKREF
, groupNumber
);
1348 delete fCaptureName
;
1349 fCaptureName
= NULL
;
1353 case doPossessivePlus
:
1354 // Possessive ++ quantifier.
1357 // 2. body of stuff being iterated over
1363 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1364 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1368 int32_t topLoc
= blockTopLoc(TRUE
);
1369 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1370 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1371 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1373 // Emit the STATE_SAVE
1374 appendOp(URX_STATE_SAVE
, fRXPat
->fCompiledPat
->size()+2);
1377 appendOp(URX_JMP
, topLoc
+1);
1380 appendOp(URX_LD_SP
, stoLoc
);
1384 case doPossessiveStar
:
1385 // Possessive *+ quantifier.
1389 // 3. body of stuff being iterated over
1393 // TODO: do something to cut back the state stack each time through the loop.
1395 // Reserve two slots at the top of the block.
1396 int32_t topLoc
= blockTopLoc(TRUE
);
1400 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1401 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1402 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1404 // Emit the SAVE_STATE 5
1405 int32_t L7
= fRXPat
->fCompiledPat
->size()+1;
1406 op
= buildOp(URX_STATE_SAVE
, L7
);
1407 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1409 // Append the JMP operation.
1410 appendOp(URX_JMP
, topLoc
+1);
1412 // Emit the LD_SP loc
1413 appendOp(URX_LD_SP
, stoLoc
);
1417 case doPossessiveOpt
:
1418 // Possessive ?+ quantifier.
1422 // 3. body of optional block
1427 // Reserve two slots at the top of the block.
1428 int32_t topLoc
= blockTopLoc(TRUE
);
1432 int32_t stoLoc
= allocateData(1); // Reserve the data location for storing save stack ptr.
1433 int32_t op
= buildOp(URX_STO_SP
, stoLoc
);
1434 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
);
1436 // Emit the SAVE_STATE
1437 int32_t continueLoc
= fRXPat
->fCompiledPat
->size()+1;
1438 op
= buildOp(URX_STATE_SAVE
, continueLoc
);
1439 fRXPat
->fCompiledPat
->setElementAt(op
, topLoc
+1);
1442 appendOp(URX_LD_SP
, stoLoc
);
1447 case doBeginMatchMode
:
1448 fNewModeFlags
= fModeFlags
;
1449 fSetModeFlag
= TRUE
;
1452 case doMatchMode
: // (?i) and similar
1456 case 0x69: /* 'i' */ bit
= UREGEX_CASE_INSENSITIVE
; break;
1457 case 0x64: /* 'd' */ bit
= UREGEX_UNIX_LINES
; break;
1458 case 0x6d: /* 'm' */ bit
= UREGEX_MULTILINE
; break;
1459 case 0x73: /* 's' */ bit
= UREGEX_DOTALL
; break;
1460 case 0x75: /* 'u' */ bit
= 0; /* Unicode casing */ break;
1461 case 0x77: /* 'w' */ bit
= UREGEX_UWORD
; break;
1462 case 0x78: /* 'x' */ bit
= UREGEX_COMMENTS
; break;
1463 case 0x2d: /* '-' */ fSetModeFlag
= FALSE
; break;
1465 U_ASSERT(FALSE
); // Should never happen. Other chars are filtered out
1469 fNewModeFlags
|= bit
;
1471 fNewModeFlags
&= ~bit
;
1476 case doSetMatchMode
:
1477 // Emit code to match any pending literals, using the not-yet changed match mode.
1480 // We've got a (?i) or similar. The match mode is being changed, but
1481 // the change is not scoped to a parenthesized block.
1482 U_ASSERT(fNewModeFlags
< 0);
1483 fModeFlags
= fNewModeFlags
;
1488 case doMatchModeParen
:
1489 // We've got a (?i: or similar. Begin a parenthesized block, save old
1490 // mode flags so they can be restored at the close of the block.
1493 // - NOP, which later may be replaced by a save-state if the
1494 // parenthesized group gets a * quantifier, followed by
1495 // - NOP, which may later be replaced by a save-state if there
1496 // is an '|' alternation within the parens.
1499 appendOp(URX_NOP
, 0);
1500 appendOp(URX_NOP
, 0);
1502 // On the Parentheses stack, start a new frame and add the postions
1503 // of the two NOPs (a normal non-capturing () frame, except for the
1504 // saving of the orignal mode flags.)
1505 fParenStack
.push(fModeFlags
, *fStatus
);
1506 fParenStack
.push(flags
, *fStatus
); // Frame Marker
1507 fParenStack
.push(fRXPat
->fCompiledPat
->size()-2, *fStatus
); // The first NOP
1508 fParenStack
.push(fRXPat
->fCompiledPat
->size()-1, *fStatus
); // The second NOP
1510 // Set the current mode flags to the new values.
1511 U_ASSERT(fNewModeFlags
< 0);
1512 fModeFlags
= fNewModeFlags
;
1517 error(U_REGEX_INVALID_FLAG
);
1520 case doSuppressComments
:
1521 // We have just scanned a '(?'. We now need to prevent the character scanner from
1522 // treating a '#' as a to-the-end-of-line comment.
1523 // (This Perl compatibility just gets uglier and uglier to do...)
1524 fEOLComments
= FALSE
;
1530 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1537 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1542 case doSetBackslash_s
:
1544 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1545 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1549 case doSetBackslash_S
:
1551 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1552 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISSPACE_SET
]);
1558 case doSetBackslash_d
:
1560 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1561 // TODO - make a static set, ticket 6058.
1562 addCategory(set
, U_GC_ND_MASK
, *fStatus
);
1566 case doSetBackslash_D
:
1568 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1570 // TODO - make a static set, ticket 6058.
1571 digits
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
1572 digits
.complement();
1573 set
->addAll(digits
);
1577 case doSetBackslash_h
:
1579 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1581 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1582 h
.add((UChar32
)9); // Tab
1587 case doSetBackslash_H
:
1589 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1591 h
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
1592 h
.add((UChar32
)9); // Tab
1598 case doSetBackslash_v
:
1600 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1601 set
->add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1602 set
->add((UChar32
)0x85);
1603 set
->add((UChar32
)0x2028, (UChar32
)0x2029);
1607 case doSetBackslash_V
:
1609 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1611 v
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
1612 v
.add((UChar32
)0x85);
1613 v
.add((UChar32
)0x2028, (UChar32
)0x2029);
1619 case doSetBackslash_w
:
1621 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1622 set
->addAll(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1626 case doSetBackslash_W
:
1628 UnicodeSet
*set
= (UnicodeSet
*)fSetStack
.peek();
1629 UnicodeSet
SSet(*RegexStaticSets::gStaticSets
->fPropSets
[URX_ISWORD_SET
]);
1637 fSetStack
.push(new UnicodeSet(), *fStatus
);
1638 fSetOpStack
.push(setStart
, *fStatus
);
1639 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1640 fSetOpStack
.push(setCaseClose
, *fStatus
);
1644 case doSetBeginDifference1
:
1645 // We have scanned something like [[abc]-[
1646 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1647 // Push a Difference operator, which will cause the new set to be subtracted from what
1648 // went before once it is created.
1649 setPushOp(setDifference1
);
1650 fSetOpStack
.push(setStart
, *fStatus
);
1651 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1652 fSetOpStack
.push(setCaseClose
, *fStatus
);
1656 case doSetBeginIntersection1
:
1657 // We have scanned something like [[abc]&[
1658 // Need both the '&' operator and the open '[' operator.
1659 setPushOp(setIntersection1
);
1660 fSetOpStack
.push(setStart
, *fStatus
);
1661 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1662 fSetOpStack
.push(setCaseClose
, *fStatus
);
1666 case doSetBeginUnion
:
1667 // We have scanned something like [[abc][
1668 // Need to handle the union operation explicitly [[abc] | [
1669 setPushOp(setUnion
);
1670 fSetOpStack
.push(setStart
, *fStatus
);
1671 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) != 0) {
1672 fSetOpStack
.push(setCaseClose
, *fStatus
);
1676 case doSetDifference2
:
1677 // We have scanned something like [abc--
1678 // Consider this to unambiguously be a set difference operator.
1679 setPushOp(setDifference2
);
1683 // Have encountered the ']' that closes a set.
1684 // Force the evaluation of any pending operations within this set,
1685 // leave the completed set on the top of the set stack.
1687 U_ASSERT(fSetOpStack
.peeki()==setStart
);
1693 // Finished a complete set expression, including all nested sets.
1694 // The close bracket has already triggered clearing out pending set operators,
1695 // the operator stack should be empty and the operand stack should have just
1696 // one entry, the result set.
1697 U_ASSERT(fSetOpStack
.empty());
1698 UnicodeSet
*theSet
= (UnicodeSet
*)fSetStack
.pop();
1699 U_ASSERT(fSetStack
.empty());
1704 case doSetIntersection2
:
1705 // Have scanned something like [abc&&
1706 setPushOp(setIntersection2
);
1710 // Union the just-scanned literal character into the set being built.
1711 // This operation is the highest precedence set operation, so we can always do
1712 // it immediately, without waiting to see what follows. It is necessary to perform
1713 // any pending '-' or '&' operation first, because these have the same precedence
1714 // as union-ing in a literal'
1717 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1719 fLastSetLiteral
= fC
.fChar
;
1723 case doSetLiteralEscaped
:
1724 // A back-slash escaped literal character was encountered.
1725 // Processing is the same as with setLiteral, above, with the addition of
1726 // the optional check for errors on escaped ASCII letters.
1728 if ((fModeFlags
& UREGEX_ERROR_ON_UNKNOWN_ESCAPES
) != 0 &&
1729 ((fC
.fChar
>= 0x41 && fC
.fChar
<= 0x5A) || // in [A-Z]
1730 (fC
.fChar
>= 0x61 && fC
.fChar
<= 0x7a))) { // in [a-z]
1731 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
1734 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1736 fLastSetLiteral
= fC
.fChar
;
1740 case doSetNamedChar
:
1741 // Scanning a \N{UNICODE CHARACTER NAME}
1742 // Aside from the source of the character, the processing is identical to doSetLiteral,
1745 UChar32 c
= scanNamedChar();
1747 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1749 fLastSetLiteral
= c
;
1753 case doSetNamedRange
:
1754 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1755 // The left character is already in the set, and is saved in fLastSetLiteral.
1756 // The right side needs to be picked up, the scan is at the 'N'.
1757 // Lower Limit > Upper limit being an error matches both Java
1758 // and ICU UnicodeSet behavior.
1760 UChar32 c
= scanNamedChar();
1761 if (U_SUCCESS(*fStatus
) && (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> c
)) {
1762 error(U_REGEX_INVALID_RANGE
);
1764 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1765 s
->add(fLastSetLiteral
, c
);
1766 fLastSetLiteral
= c
;
1772 // Scanned a '^' at the start of a set.
1773 // Push the negation operator onto the set op stack.
1774 // A twist for case-insensitive matching:
1775 // the case closure operation must happen _before_ negation.
1776 // But the case closure operation will already be on the stack if it's required.
1777 // This requires checking for case closure, and swapping the stack order
1778 // if it is present.
1780 int32_t tosOp
= fSetOpStack
.peeki();
1781 if (tosOp
== setCaseClose
) {
1783 fSetOpStack
.push(setNegation
, *fStatus
);
1784 fSetOpStack
.push(setCaseClose
, *fStatus
);
1786 fSetOpStack
.push(setNegation
, *fStatus
);
1791 case doSetNoCloseError
:
1792 error(U_REGEX_MISSING_CLOSE_BRACKET
);
1796 error(U_REGEX_RULE_SYNTAX
); // -- or && at the end of a set. Illegal.
1799 case doSetPosixProp
:
1801 UnicodeSet
*s
= scanPosixProp();
1803 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1806 } // else error. scanProp() reported the error status already.
1811 // Scanned a \p \P within [brackets].
1813 UnicodeSet
*s
= scanProp();
1815 UnicodeSet
*tos
= (UnicodeSet
*)fSetStack
.peek();
1818 } // else error. scanProp() reported the error status already.
1824 // We have scanned literal-literal. Add the range to the set.
1825 // The left character is already in the set, and is saved in fLastSetLiteral.
1826 // The right side is the current character.
1827 // Lower Limit > Upper limit being an error matches both Java
1828 // and ICU UnicodeSet behavior.
1831 if (fLastSetLiteral
== U_SENTINEL
|| fLastSetLiteral
> fC
.fChar
) {
1832 error(U_REGEX_INVALID_RANGE
);
1834 UnicodeSet
*s
= (UnicodeSet
*)fSetStack
.peek();
1835 s
->add(fLastSetLiteral
, fC
.fChar
);
1841 error(U_REGEX_INTERNAL_ERROR
);
1845 if (U_FAILURE(*fStatus
)) {
1854 //------------------------------------------------------------------------------
1856 // literalChar We've encountered a literal character from the pattern,
1857 // or an escape sequence that reduces to a character.
1858 // Add it to the string containing all literal chars/strings from
1861 //------------------------------------------------------------------------------
1862 void RegexCompile::literalChar(UChar32 c
) {
1863 fLiteralChars
.append(c
);
1867 //------------------------------------------------------------------------------
1869 // fixLiterals When compiling something that can follow a literal
1870 // string in a pattern, emit the code to match the
1871 // accumulated literal string.
1873 // Optionally, split the last char of the string off into
1874 // a single "ONE_CHAR" operation, so that quantifiers can
1875 // apply to that char alone. Example: abc*
1876 // The * must apply to the 'c' only.
1878 //------------------------------------------------------------------------------
1879 void RegexCompile::fixLiterals(UBool split
) {
1881 // If no literal characters have been scanned but not yet had code generated
1882 // for them, nothing needs to be done.
1883 if (fLiteralChars
.length() == 0) {
1887 int32_t indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1888 UChar32 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1890 // Split: We need to ensure that the last item in the compiled pattern
1891 // refers only to the last literal scanned in the pattern, so that
1892 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1893 // Split before case folding for case insensitive matches.
1896 fLiteralChars
.truncate(indexOfLastCodePoint
);
1897 fixLiterals(FALSE
); // Recursive call, emit code to match the first part of the string.
1898 // Note that the truncated literal string may be empty, in which case
1899 // nothing will be emitted.
1901 literalChar(lastCodePoint
); // Re-add the last code point as if it were a new literal.
1902 fixLiterals(FALSE
); // Second recursive call, code for the final code point.
1906 // If we are doing case-insensitive matching, case fold the string. This may expand
1907 // the string, e.g. the German sharp-s turns into "ss"
1908 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1909 fLiteralChars
.foldCase();
1910 indexOfLastCodePoint
= fLiteralChars
.moveIndex32(fLiteralChars
.length(), -1);
1911 lastCodePoint
= fLiteralChars
.char32At(indexOfLastCodePoint
);
1914 if (indexOfLastCodePoint
== 0) {
1915 // Single character, emit a URX_ONECHAR op to match it.
1916 if ((fModeFlags
& UREGEX_CASE_INSENSITIVE
) &&
1917 u_hasBinaryProperty(lastCodePoint
, UCHAR_CASE_SENSITIVE
)) {
1918 appendOp(URX_ONECHAR_I
, lastCodePoint
);
1920 appendOp(URX_ONECHAR
, lastCodePoint
);
1923 // Two or more chars, emit a URX_STRING to match them.
1924 if (fLiteralChars
.length() > 0x00ffffff || fRXPat
->fLiteralText
.length() > 0x00ffffff) {
1925 error(U_REGEX_PATTERN_TOO_BIG
);
1927 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
1928 appendOp(URX_STRING_I
, fRXPat
->fLiteralText
.length());
1930 // TODO here: add optimization to split case sensitive strings of length two
1931 // into two single char ops, for efficiency.
1932 appendOp(URX_STRING
, fRXPat
->fLiteralText
.length());
1934 appendOp(URX_STRING_LEN
, fLiteralChars
.length());
1936 // Add this string into the accumulated strings of the compiled pattern.
1937 fRXPat
->fLiteralText
.append(fLiteralChars
);
1940 fLiteralChars
.remove();
1944 int32_t RegexCompile::buildOp(int32_t type
, int32_t val
) {
1945 if (U_FAILURE(*fStatus
)) {
1948 if (type
< 0 || type
> 255) {
1950 error(U_REGEX_INTERNAL_ERROR
);
1951 type
= URX_RESERVED_OP
;
1953 if (val
> 0x00ffffff) {
1955 error(U_REGEX_INTERNAL_ERROR
);
1959 if (!(type
== URX_RESERVED_OP_N
|| type
== URX_RESERVED_OP
)) {
1961 error(U_REGEX_INTERNAL_ERROR
);
1964 if (URX_TYPE(val
) != 0xff) {
1966 error(U_REGEX_INTERNAL_ERROR
);
1969 type
= URX_RESERVED_OP_N
;
1971 return (type
<< 24) | val
;
1975 //------------------------------------------------------------------------------
1977 // appendOp() Append a new instruction onto the compiled pattern
1978 // Includes error checking, limiting the size of the
1979 // pattern to lengths that can be represented in the
1980 // 24 bit operand field of an instruction.
1982 //------------------------------------------------------------------------------
1983 void RegexCompile::appendOp(int32_t op
) {
1984 if (U_FAILURE(*fStatus
)) {
1987 fRXPat
->fCompiledPat
->addElement(op
, *fStatus
);
1988 if ((fRXPat
->fCompiledPat
->size() > 0x00fffff0) && U_SUCCESS(*fStatus
)) {
1989 error(U_REGEX_PATTERN_TOO_BIG
);
1993 void RegexCompile::appendOp(int32_t type
, int32_t val
) {
1994 appendOp(buildOp(type
, val
));
1998 //------------------------------------------------------------------------------
2000 // insertOp() Insert a slot for a new opcode into the already
2001 // compiled pattern code.
2003 // Fill the slot with a NOP. Our caller will replace it
2004 // with what they really wanted.
2006 //------------------------------------------------------------------------------
2007 void RegexCompile::insertOp(int32_t where
) {
2008 UVector64
*code
= fRXPat
->fCompiledPat
;
2009 U_ASSERT(where
>0 && where
< code
->size());
2011 int32_t nop
= buildOp(URX_NOP
, 0);
2012 code
->insertElementAt(nop
, where
, *fStatus
);
2014 // Walk through the pattern, looking for any ops with targets that
2015 // were moved down by the insert. Fix them.
2017 for (loc
=0; loc
<code
->size(); loc
++) {
2018 int32_t op
= (int32_t)code
->elementAti(loc
);
2019 int32_t opType
= URX_TYPE(op
);
2020 int32_t opValue
= URX_VAL(op
);
2021 if ((opType
== URX_JMP
||
2022 opType
== URX_JMPX
||
2023 opType
== URX_STATE_SAVE
||
2024 opType
== URX_CTR_LOOP
||
2025 opType
== URX_CTR_LOOP_NG
||
2026 opType
== URX_JMP_SAV
||
2027 opType
== URX_JMP_SAV_X
||
2028 opType
== URX_RELOC_OPRND
) && opValue
> where
) {
2029 // Target location for this opcode is after the insertion point and
2030 // needs to be incremented to adjust for the insertion.
2032 op
= buildOp(opType
, opValue
);
2033 code
->setElementAt(op
, loc
);
2037 // Now fix up the parentheses stack. All positive values in it are locations in
2038 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2039 for (loc
=0; loc
<fParenStack
.size(); loc
++) {
2040 int32_t x
= fParenStack
.elementAti(loc
);
2041 U_ASSERT(x
< code
->size());
2044 fParenStack
.setElementAt(x
, loc
);
2048 if (fMatchCloseParen
> where
) {
2051 if (fMatchOpenParen
> where
) {
2057 //------------------------------------------------------------------------------
2059 // allocateData() Allocate storage in the matcher's static data area.
2060 // Return the index for the newly allocated data.
2061 // The storage won't actually exist until we are running a match
2062 // operation, but the storage indexes are inserted into various
2063 // opcodes while compiling the pattern.
2065 //------------------------------------------------------------------------------
2066 int32_t RegexCompile::allocateData(int32_t size
) {
2067 if (U_FAILURE(*fStatus
)) {
2070 if (size
<= 0 || size
> 0x100 || fRXPat
->fDataSize
< 0) {
2071 error(U_REGEX_INTERNAL_ERROR
);
2074 int32_t dataIndex
= fRXPat
->fDataSize
;
2075 fRXPat
->fDataSize
+= size
;
2076 if (fRXPat
->fDataSize
>= 0x00fffff0) {
2077 error(U_REGEX_INTERNAL_ERROR
);
2083 //------------------------------------------------------------------------------
2085 // allocateStackData() Allocate space in the back-tracking stack frame.
2086 // Return the index for the newly allocated data.
2087 // The frame indexes are inserted into various
2088 // opcodes while compiling the pattern, meaning that frame
2089 // size must be restricted to the size that will fit
2090 // as an operand (24 bits).
2092 //------------------------------------------------------------------------------
2093 int32_t RegexCompile::allocateStackData(int32_t size
) {
2094 if (U_FAILURE(*fStatus
)) {
2097 if (size
<= 0 || size
> 0x100 || fRXPat
->fFrameSize
< 0) {
2098 error(U_REGEX_INTERNAL_ERROR
);
2101 int32_t dataIndex
= fRXPat
->fFrameSize
;
2102 fRXPat
->fFrameSize
+= size
;
2103 if (fRXPat
->fFrameSize
>= 0x00fffff0) {
2104 error(U_REGEX_PATTERN_TOO_BIG
);
2110 //------------------------------------------------------------------------------
2112 // blockTopLoc() Find or create a location in the compiled pattern
2113 // at the start of the operation or block that has
2114 // just been compiled. Needed when a quantifier (* or
2115 // whatever) appears, and we need to add an operation
2116 // at the start of the thing being quantified.
2118 // (Parenthesized Blocks) have a slot with a NOP that
2119 // is reserved for this purpose. .* or similar don't
2120 // and a slot needs to be added.
2122 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2123 // at the returned location.
2124 // FALSE - just return the address,
2125 // do not reserve a location there.
2127 //------------------------------------------------------------------------------
2128 int32_t RegexCompile::blockTopLoc(UBool reserveLoc
) {
2130 fixLiterals(TRUE
); // Emit code for any pending literals.
2131 // If last item was a string, emit separate op for the its last char.
2132 if (fRXPat
->fCompiledPat
->size() == fMatchCloseParen
)
2134 // The item just processed is a parenthesized block.
2135 theLoc
= fMatchOpenParen
; // A slot is already reserved for us.
2136 U_ASSERT(theLoc
> 0);
2137 U_ASSERT(URX_TYPE(((uint32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
))) == URX_NOP
);
2140 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2141 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2142 // We need to make space now.
2143 theLoc
= fRXPat
->fCompiledPat
->size()-1;
2144 int32_t opAtTheLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(theLoc
);
2145 if (URX_TYPE(opAtTheLoc
) == URX_STRING_LEN
) {
2146 // Strings take two opcode, we want the position of the first one.
2147 // We can have a string at this point if a single character case-folded to two.
2151 int32_t nop
= buildOp(URX_NOP
, 0);
2152 fRXPat
->fCompiledPat
->insertElementAt(nop
, theLoc
, *fStatus
);
2160 //------------------------------------------------------------------------------
2162 // handleCloseParen When compiling a close paren, we need to go back
2163 // and fix up any JMP or SAVE operations within the
2164 // parenthesized block that need to target the end
2165 // of the block. The locations of these are kept on
2166 // the paretheses stack.
2168 // This function is called both when encountering a
2169 // real ) and at the end of the pattern.
2171 //------------------------------------------------------------------------------
2172 void RegexCompile::handleCloseParen() {
2175 if (fParenStack
.size() <= 0) {
2176 error(U_REGEX_MISMATCHED_PAREN
);
2180 // Emit code for any pending literals.
2183 // Fixup any operations within the just-closed parenthesized group
2184 // that need to reference the end of the (block).
2185 // (The first one popped from the stack is an unused slot for
2186 // alternation (OR) state save, but applying the fixup to it does no harm.)
2188 patIdx
= fParenStack
.popi();
2190 // value < 0 flags the start of the frame on the paren stack.
2193 U_ASSERT(patIdx
>0 && patIdx
<= fRXPat
->fCompiledPat
->size());
2194 patOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(patIdx
);
2195 U_ASSERT(URX_VAL(patOp
) == 0); // Branch target for JMP should not be set.
2196 patOp
|= fRXPat
->fCompiledPat
->size(); // Set it now.
2197 fRXPat
->fCompiledPat
->setElementAt(patOp
, patIdx
);
2198 fMatchOpenParen
= patIdx
;
2201 // At the close of any parenthesized block, restore the match mode flags to
2202 // the value they had at the open paren. Saved value is
2203 // at the top of the paren stack.
2204 fModeFlags
= fParenStack
.popi();
2205 U_ASSERT(fModeFlags
< 0);
2207 // DO any additional fixups, depending on the specific kind of
2208 // parentesized grouping this is
2213 // No additional fixups required.
2214 // (Grouping-only parentheses)
2217 // Capturing Parentheses.
2218 // Insert a End Capture op into the pattern.
2219 // The frame offset of the variables for this cg is obtained from the
2220 // start capture op and put it into the end-capture op.
2222 int32_t captureOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2223 U_ASSERT(URX_TYPE(captureOp
) == URX_START_CAPTURE
);
2225 int32_t frameVarLocation
= URX_VAL(captureOp
);
2226 appendOp(URX_END_CAPTURE
, frameVarLocation
);
2230 // Atomic Parenthesis.
2231 // Insert a LD_SP operation to restore the state stack to the position
2232 // it was when the atomic parens were entered.
2234 int32_t stoOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
+1);
2235 U_ASSERT(URX_TYPE(stoOp
) == URX_STO_SP
);
2236 int32_t stoLoc
= URX_VAL(stoOp
);
2237 appendOp(URX_LD_SP
, stoLoc
);
2243 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2244 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2245 int32_t dataLoc
= URX_VAL(startOp
);
2246 appendOp(URX_LA_END
, dataLoc
);
2252 // See comment at doOpenLookAheadNeg
2253 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-1);
2254 U_ASSERT(URX_TYPE(startOp
) == URX_LA_START
);
2255 int32_t dataLoc
= URX_VAL(startOp
);
2256 appendOp(URX_LA_END
, dataLoc
);
2257 appendOp(URX_BACKTRACK
, 0);
2258 appendOp(URX_LA_END
, dataLoc
);
2260 // Patch the URX_SAVE near the top of the block.
2261 // The destination of the SAVE is the final LA_END that was just added.
2262 int32_t saveOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
);
2263 U_ASSERT(URX_TYPE(saveOp
) == URX_STATE_SAVE
);
2264 int32_t dest
= fRXPat
->fCompiledPat
->size()-1;
2265 saveOp
= buildOp(URX_STATE_SAVE
, dest
);
2266 fRXPat
->fCompiledPat
->setElementAt(saveOp
, fMatchOpenParen
);
2272 // See comment at doOpenLookBehind.
2274 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2275 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-4);
2276 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2277 int32_t dataLoc
= URX_VAL(startOp
);
2278 appendOp(URX_LB_END
, dataLoc
);
2279 appendOp(URX_LA_END
, dataLoc
);
2281 // Determine the min and max bounds for the length of the
2282 // string that the pattern can match.
2283 // An unbounded upper limit is an error.
2284 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2285 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2286 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2287 if (URX_TYPE(maxML
) != 0) {
2288 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2291 if (maxML
== INT32_MAX
) {
2292 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2295 U_ASSERT(minML
<= maxML
);
2297 // Insert the min and max match len bounds into the URX_LB_CONT op that
2298 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2299 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-2);
2300 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-1);
2309 // See comment at doOpenLookBehindNeg.
2311 // Append the URX_LBN_END to the compiled pattern.
2312 int32_t startOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(fMatchOpenParen
-5);
2313 U_ASSERT(URX_TYPE(startOp
) == URX_LB_START
);
2314 int32_t dataLoc
= URX_VAL(startOp
);
2315 appendOp(URX_LBN_END
, dataLoc
);
2317 // Determine the min and max bounds for the length of the
2318 // string that the pattern can match.
2319 // An unbounded upper limit is an error.
2320 int32_t patEnd
= fRXPat
->fCompiledPat
->size() - 1;
2321 int32_t minML
= minMatchLength(fMatchOpenParen
, patEnd
);
2322 int32_t maxML
= maxMatchLength(fMatchOpenParen
, patEnd
);
2323 if (URX_TYPE(maxML
) != 0) {
2324 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2327 if (maxML
== INT32_MAX
) {
2328 error(U_REGEX_LOOK_BEHIND_LIMIT
);
2331 U_ASSERT(minML
<= maxML
);
2333 // Insert the min and max match len bounds into the URX_LB_CONT op that
2334 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2335 fRXPat
->fCompiledPat
->setElementAt(minML
, fMatchOpenParen
-3);
2336 fRXPat
->fCompiledPat
->setElementAt(maxML
, fMatchOpenParen
-2);
2338 // Insert the pattern location to continue at after a successful match
2339 // as the last operand of the URX_LBN_CONT
2340 int32_t op
= buildOp(URX_RELOC_OPRND
, fRXPat
->fCompiledPat
->size());
2341 fRXPat
->fCompiledPat
->setElementAt(op
, fMatchOpenParen
-1);
2351 // remember the next location in the compiled pattern.
2352 // The compilation of Quantifiers will look at this to see whether its looping
2353 // over a parenthesized block or a single item
2354 fMatchCloseParen
= fRXPat
->fCompiledPat
->size();
2359 //------------------------------------------------------------------------------
2361 // compileSet Compile the pattern operations for a reference to a
2364 //------------------------------------------------------------------------------
2365 void RegexCompile::compileSet(UnicodeSet
*theSet
)
2367 if (theSet
== NULL
) {
2370 // Remove any strings from the set.
2371 // There shoudn't be any, but just in case.
2372 // (Case Closure can add them; if we had a simple case closure avaialble that
2373 // ignored strings, that would be better.)
2374 theSet
->removeAllStrings();
2375 int32_t setSize
= theSet
->size();
2380 // Set of no elements. Always fails to match.
2381 appendOp(URX_BACKTRACK
, 0);
2388 // The set contains only a single code point. Put it into
2389 // the compiled pattern as a single char operation rather
2390 // than a set, and discard the set itself.
2391 literalChar(theSet
->charAt(0));
2398 // The set contains two or more chars. (the normal case)
2399 // Put it into the compiled pattern as a set.
2400 int32_t setNumber
= fRXPat
->fSets
->size();
2401 fRXPat
->fSets
->addElement(theSet
, *fStatus
);
2402 appendOp(URX_SETREF
, setNumber
);
2408 //------------------------------------------------------------------------------
2410 // compileInterval Generate the code for a {min, max} style interval quantifier.
2411 // Except for the specific opcodes used, the code is the same
2412 // for all three types (greedy, non-greedy, possessive) of
2413 // intervals. The opcodes are supplied as parameters.
2414 // (There are two sets of opcodes - greedy & possessive use the
2415 // same ones, while non-greedy has it's own.)
2417 // The code for interval loops has this form:
2418 // 0 CTR_INIT counter loc (in stack frame)
2419 // 1 5 patt address of CTR_LOOP at bottom of block
2421 // 3 max count (-1 for unbounded)
2422 // 4 ... block to be iterated over
2426 //------------------------------------------------------------------------------
2427 void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp
)
2429 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2430 // four slots in the compiled code. Reserve them.
2431 int32_t topOfBlock
= blockTopLoc(TRUE
);
2432 insertOp(topOfBlock
);
2433 insertOp(topOfBlock
);
2434 insertOp(topOfBlock
);
2436 // The operands for the CTR_INIT opcode include the index in the matcher data
2437 // of the counter. Allocate it now. There are two data items
2438 // counterLoc --> Loop counter
2439 // +1 --> Input index (for breaking non-progressing loops)
2440 // (Only present if unbounded upper limit on loop)
2441 int32_t dataSize
= fIntervalUpper
< 0 ? 2 : 1;
2442 int32_t counterLoc
= allocateStackData(dataSize
);
2444 int32_t op
= buildOp(InitOp
, counterLoc
);
2445 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
);
2447 // The second operand of CTR_INIT is the location following the end of the loop.
2448 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2449 // compilation of something later on causes the code to grow and the target
2450 // position to move.
2451 int32_t loopEnd
= fRXPat
->fCompiledPat
->size();
2452 op
= buildOp(URX_RELOC_OPRND
, loopEnd
);
2453 fRXPat
->fCompiledPat
->setElementAt(op
, topOfBlock
+1);
2455 // Followed by the min and max counts.
2456 fRXPat
->fCompiledPat
->setElementAt(fIntervalLow
, topOfBlock
+2);
2457 fRXPat
->fCompiledPat
->setElementAt(fIntervalUpper
, topOfBlock
+3);
2459 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2460 // Goes at end of the block being looped over, so just append to the code so far.
2461 appendOp(LoopOp
, topOfBlock
);
2463 if ((fIntervalLow
& 0xff000000) != 0 ||
2464 (fIntervalUpper
> 0 && (fIntervalUpper
& 0xff000000) != 0)) {
2465 error(U_REGEX_NUMBER_TOO_BIG
);
2468 if (fIntervalLow
> fIntervalUpper
&& fIntervalUpper
!= -1) {
2469 error(U_REGEX_MAX_LT_MIN
);
2475 UBool
RegexCompile::compileInlineInterval() {
2476 if (fIntervalUpper
> 10 || fIntervalUpper
< fIntervalLow
) {
2477 // Too big to inline. Fail, which will cause looping code to be generated.
2478 // (Upper < Lower picks up unbounded upper and errors, both.)
2482 int32_t topOfBlock
= blockTopLoc(FALSE
);
2483 if (fIntervalUpper
== 0) {
2484 // Pathological case. Attempt no matches, as if the block doesn't exist.
2485 // Discard the generated code for the block.
2486 // If the block included parens, discard the info pertaining to them as well.
2487 fRXPat
->fCompiledPat
->setSize(topOfBlock
);
2488 if (fMatchOpenParen
>= topOfBlock
) {
2489 fMatchOpenParen
= -1;
2491 if (fMatchCloseParen
>= topOfBlock
) {
2492 fMatchCloseParen
= -1;
2497 if (topOfBlock
!= fRXPat
->fCompiledPat
->size()-1 && fIntervalUpper
!= 1) {
2498 // The thing being repeated is not a single op, but some
2499 // more complex block. Do it as a loop, not inlines.
2500 // Note that things "repeated" a max of once are handled as inline, because
2501 // the one copy of the code already generated is just fine.
2505 // Pick up the opcode that is to be repeated
2507 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(topOfBlock
);
2509 // Compute the pattern location where the inline sequence
2510 // will end, and set up the state save op that will be needed.
2512 int32_t endOfSequenceLoc
= fRXPat
->fCompiledPat
->size()-1
2513 + fIntervalUpper
+ (fIntervalUpper
-fIntervalLow
);
2514 int32_t saveOp
= buildOp(URX_STATE_SAVE
, endOfSequenceLoc
);
2515 if (fIntervalLow
== 0) {
2516 insertOp(topOfBlock
);
2517 fRXPat
->fCompiledPat
->setElementAt(saveOp
, topOfBlock
);
2522 // Loop, emitting the op for the thing being repeated each time.
2523 // Loop starts at 1 because one instance of the op already exists in the pattern,
2524 // it was put there when it was originally encountered.
2526 for (i
=1; i
<fIntervalUpper
; i
++ ) {
2527 if (i
>= fIntervalLow
) {
2537 //------------------------------------------------------------------------------
2539 // caseInsensitiveStart given a single code point from a pattern string, determine the
2540 // set of characters that could potentially begin a case-insensitive
2541 // match of a string beginning with that character, using full Unicode
2542 // case insensitive matching.
2544 // This is used in optimizing find().
2546 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2547 // misses cases like this:
2548 // A string from the pattern begins with 'ss' (although all we know
2549 // in this context is that it begins with 's')
2550 // The pattern could match a string beginning with a German sharp-s
2552 // To the ordinary case closure for a character c, we add all other
2553 // characters cx where the case closure of cx incudes a string form that begins
2554 // with the original character c.
2556 // This function could be made smarter. The full pattern string is available
2557 // and it would be possible to verify that the extra characters being added
2558 // to the starting set fully match, rather than having just a first-char of the
2559 // folded form match.
2561 //------------------------------------------------------------------------------
2562 void RegexCompile::findCaseInsensitiveStarters(UChar32 c
, UnicodeSet
*starterChars
) {
2564 // Machine Generated below.
2565 // It may need updating with new versions of Unicode.
2566 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2567 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2569 // Machine Generated Data. Do not hand edit.
2570 static const UChar32 RECaseFixCodePoints
[] = {
2571 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2572 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2573 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2574 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2575 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2577 static const int16_t RECaseFixStringOffsets
[] = {
2578 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2579 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2580 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2581 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2582 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2584 static const int16_t RECaseFixCounts
[] = {
2585 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2586 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2587 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2588 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2589 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2591 static const UChar RECaseFixData
[] = {
2592 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2593 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2594 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2595 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2596 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2597 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2598 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2599 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2600 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2601 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2602 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2604 // End of machine generated data.
2606 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2607 UChar32 caseFoldedC
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
2608 starterChars
->set(caseFoldedC
, caseFoldedC
);
2611 for (i
=0; RECaseFixCodePoints
[i
]<c
; i
++) {
2612 // Simple linear search through the sorted list of interesting code points.
2615 if (RECaseFixCodePoints
[i
] == c
) {
2616 int32_t dataIndex
= RECaseFixStringOffsets
[i
];
2617 int32_t numCharsToAdd
= RECaseFixCounts
[i
];
2618 UChar32 cpToAdd
= 0;
2619 for (int32_t j
=0; j
<numCharsToAdd
; j
++) {
2620 U16_NEXT_UNSAFE(RECaseFixData
, dataIndex
, cpToAdd
);
2621 starterChars
->add(cpToAdd
);
2625 starterChars
->closeOver(USET_CASE_INSENSITIVE
);
2626 starterChars
->removeAllStrings();
2628 // Not a cased character. Just return it alone.
2629 starterChars
->set(c
, c
);
2636 //------------------------------------------------------------------------------
2638 // matchStartType Determine how a match can start.
2639 // Used to optimize find() operations.
2641 // Operation is very similar to minMatchLength(). Walk the compiled
2642 // pattern, keeping an on-going minimum-match-length. For any
2643 // op where the min match coming in is zero, add that ops possible
2644 // starting matches to the possible starts for the overall pattern.
2646 //------------------------------------------------------------------------------
2647 void RegexCompile::matchStartType() {
2648 if (U_FAILURE(*fStatus
)) {
2653 int32_t loc
; // Location in the pattern of the current op being processed.
2654 int32_t op
; // The op being processed
2655 int32_t opType
; // The opcode type of the op
2656 int32_t currentLen
= 0; // Minimum length of a match to this point (loc) in the pattern
2657 int32_t numInitialStrings
= 0; // Number of strings encountered that could match at start.
2659 UBool atStart
= TRUE
; // True if no part of the pattern yet encountered
2660 // could have advanced the position in a match.
2661 // (Maximum match length so far == 0)
2663 // forwardedLength is a vector holding minimum-match-length values that
2664 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2665 // It must be one longer than the pattern being checked because some ops
2666 // will jmp to a end-of-block+1 location from within a block, and we must
2667 // count those when checking the block.
2668 int32_t end
= fRXPat
->fCompiledPat
->size();
2669 UVector32
forwardedLength(end
+1, *fStatus
);
2670 forwardedLength
.setSize(end
+1);
2671 for (loc
=3; loc
<end
; loc
++) {
2672 forwardedLength
.setElementAt(INT32_MAX
, loc
);
2675 for (loc
= 3; loc
<end
; loc
++) {
2676 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2677 opType
= URX_TYPE(op
);
2679 // The loop is advancing linearly through the pattern.
2680 // If the op we are now at was the destination of a branch in the pattern,
2681 // and that path has a shorter minimum length than the current accumulated value,
2682 // replace the current accumulated value.
2683 if (forwardedLength
.elementAti(loc
) < currentLen
) {
2684 currentLen
= forwardedLength
.elementAti(loc
);
2685 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
2689 // Ops that don't change the total length matched
2690 case URX_RESERVED_OP
:
2693 case URX_STRING_LEN
:
2695 case URX_START_CAPTURE
:
2696 case URX_END_CAPTURE
:
2697 case URX_BACKSLASH_B
:
2698 case URX_BACKSLASH_BU
:
2699 case URX_BACKSLASH_G
:
2700 case URX_BACKSLASH_Z
:
2705 case URX_RELOC_OPRND
:
2706 case URX_STO_INP_LOC
:
2707 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
2710 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
2716 fRXPat
->fStartType
= START_START
;
2721 case URX_CARET_M_UNIX
:
2723 fRXPat
->fStartType
= START_LINE
;
2728 if (currentLen
== 0) {
2729 // This character could appear at the start of a match.
2730 // Add it to the set of possible starting characters.
2731 fRXPat
->fInitialChars
->add(URX_VAL(op
));
2732 numInitialStrings
+= 2;
2740 if (currentLen
== 0) {
2741 int32_t sn
= URX_VAL(op
);
2742 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2743 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2744 fRXPat
->fInitialChars
->addAll(*s
);
2745 numInitialStrings
+= 2;
2752 // [Set]*, like a SETREF, above, in what it can match,
2753 // but may not match at all, so currentLen is not incremented.
2754 if (currentLen
== 0) {
2755 int32_t sn
= URX_VAL(op
);
2756 U_ASSERT(sn
> 0 && sn
< fRXPat
->fSets
->size());
2757 const UnicodeSet
*s
= (UnicodeSet
*)fRXPat
->fSets
->elementAt(sn
);
2758 fRXPat
->fInitialChars
->addAll(*s
);
2759 numInitialStrings
+= 2;
2764 case URX_LOOP_DOT_I
:
2765 if (currentLen
== 0) {
2766 // .* at the start of a pattern.
2767 // Any character can begin the match.
2768 fRXPat
->fInitialChars
->clear();
2769 fRXPat
->fInitialChars
->complement();
2770 numInitialStrings
+= 2;
2776 case URX_STATIC_SETREF
:
2777 if (currentLen
== 0) {
2778 int32_t sn
= URX_VAL(op
);
2779 U_ASSERT(sn
>0 && sn
<URX_LAST_SET
);
2780 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2781 fRXPat
->fInitialChars
->addAll(*s
);
2782 numInitialStrings
+= 2;
2790 case URX_STAT_SETREF_N
:
2791 if (currentLen
== 0) {
2792 int32_t sn
= URX_VAL(op
);
2793 const UnicodeSet
*s
= fRXPat
->fStaticSets
[sn
];
2796 fRXPat
->fInitialChars
->addAll(sc
);
2797 numInitialStrings
+= 2;
2805 case URX_BACKSLASH_D
:
2807 if (currentLen
== 0) {
2809 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ND_MASK
, *fStatus
);
2810 if (URX_VAL(op
) != 0) {
2813 fRXPat
->fInitialChars
->addAll(s
);
2814 numInitialStrings
+= 2;
2821 case URX_BACKSLASH_H
:
2822 // Horiz white space
2823 if (currentLen
== 0) {
2825 s
.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK
, U_GC_ZS_MASK
, *fStatus
);
2826 s
.add((UChar32
)9); // Tab
2827 if (URX_VAL(op
) != 0) {
2830 fRXPat
->fInitialChars
->addAll(s
);
2831 numInitialStrings
+= 2;
2838 case URX_BACKSLASH_R
: // Any line ending sequence
2839 case URX_BACKSLASH_V
: // Any line ending code point, with optional negation
2840 if (currentLen
== 0) {
2842 s
.add((UChar32
)0x0a, (UChar32
)0x0d); // add range
2843 s
.add((UChar32
)0x85);
2844 s
.add((UChar32
)0x2028, (UChar32
)0x2029);
2845 if (URX_VAL(op
) != 0) {
2846 // Complement option applies to URX_BACKSLASH_V only.
2849 fRXPat
->fInitialChars
->addAll(s
);
2850 numInitialStrings
+= 2;
2859 // Case Insensitive Single Character.
2860 if (currentLen
== 0) {
2861 UChar32 c
= URX_VAL(op
);
2862 if (u_hasBinaryProperty(c
, UCHAR_CASE_SENSITIVE
)) {
2863 UnicodeSet
starters(c
, c
);
2864 starters
.closeOver(USET_CASE_INSENSITIVE
);
2865 // findCaseInsensitiveStarters(c, &starters);
2866 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2867 // The expanded folding can't match the pattern.
2868 fRXPat
->fInitialChars
->addAll(starters
);
2870 // Char has no case variants. Just add it as-is to the
2871 // set of possible starting chars.
2872 fRXPat
->fInitialChars
->add(c
);
2874 numInitialStrings
+= 2;
2881 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
2882 case URX_DOTANY_ALL
: // . matches one or two.
2884 case URX_DOTANY_UNIX
:
2885 if (currentLen
== 0) {
2886 // These constructs are all bad news when they appear at the start
2887 // of a match. Any character can begin the match.
2888 fRXPat
->fInitialChars
->clear();
2889 fRXPat
->fInitialChars
->complement();
2890 numInitialStrings
+= 2;
2898 loc
++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2902 int32_t jmpDest
= URX_VAL(op
);
2903 if (jmpDest
< loc
) {
2904 // Loop of some kind. Can safely ignore, the worst that will happen
2905 // is that we understate the true minimum length
2906 currentLen
= forwardedLength
.elementAti(loc
+1);
2909 // Forward jump. Propagate the current min length to the target loc of the jump.
2910 U_ASSERT(jmpDest
<= end
+1);
2911 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
2912 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2921 // Combo of state save to the next loc, + jmp backwards.
2922 // Net effect on min. length computation is nothing.
2927 // Fails are kind of like a branch, except that the min length was
2928 // propagated already, by the state save.
2929 currentLen
= forwardedLength
.elementAti(loc
+1);
2934 case URX_STATE_SAVE
:
2936 // State Save, for forward jumps, propagate the current minimum.
2937 // of the state save.
2938 int32_t jmpDest
= URX_VAL(op
);
2939 if (jmpDest
> loc
) {
2940 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
2941 forwardedLength
.setElementAt(currentLen
, jmpDest
);
2954 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2955 int32_t stringLen
= URX_VAL(stringLenOp
);
2956 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2957 U_ASSERT(stringLenOp
>= 2);
2958 if (currentLen
== 0) {
2959 // Add the starting character of this string to the set of possible starting
2960 // characters for this pattern.
2961 int32_t stringStartIdx
= URX_VAL(op
);
2962 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2963 fRXPat
->fInitialChars
->add(c
);
2965 // Remember this string. After the entire pattern has been checked,
2966 // if nothing else is identified that can start a match, we'll use it.
2967 numInitialStrings
++;
2968 fRXPat
->fInitialStringIdx
= stringStartIdx
;
2969 fRXPat
->fInitialStringLen
= stringLen
;
2972 currentLen
+= stringLen
;
2979 // Case-insensitive string. Unlike exact-match strings, we won't
2980 // attempt a string search for possible match positions. But we
2981 // do update the set of possible starting characters.
2983 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
2984 int32_t stringLen
= URX_VAL(stringLenOp
);
2985 U_ASSERT(URX_TYPE(stringLenOp
) == URX_STRING_LEN
);
2986 U_ASSERT(stringLenOp
>= 2);
2987 if (currentLen
== 0) {
2988 // Add the starting character of this string to the set of possible starting
2989 // characters for this pattern.
2990 int32_t stringStartIdx
= URX_VAL(op
);
2991 UChar32 c
= fRXPat
->fLiteralText
.char32At(stringStartIdx
);
2993 findCaseInsensitiveStarters(c
, &s
);
2994 fRXPat
->fInitialChars
->addAll(s
);
2995 numInitialStrings
+= 2; // Matching on an initial string not possible.
2997 currentLen
+= stringLen
;
3003 case URX_CTR_INIT_NG
:
3005 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3006 // so location must be updated accordingly.
3008 // If the min loop count == 0
3009 // move loc forwards to the end of the loop, skipping over the body.
3010 // If the min count is > 0,
3011 // continue normal processing of the body of the loop.
3012 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3013 loopEndLoc
= URX_VAL(loopEndLoc
);
3014 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3015 if (minLoopCount
== 0) {
3016 // Min Loop Count of 0, treat like a forward branch and
3017 // move the current minimum length up to the target
3018 // (end of loop) location.
3019 U_ASSERT(loopEndLoc
<= end
+1);
3020 if (forwardedLength
.elementAti(loopEndLoc
) > currentLen
) {
3021 forwardedLength
.setElementAt(currentLen
, loopEndLoc
);
3024 loc
+=3; // Skips over operands of CTR_INIT
3031 case URX_CTR_LOOP_NG
:
3033 // The jump is conditional, backwards only.
3038 // More loop ops. These state-save to themselves.
3039 // don't change the minimum match
3047 // Look-around. Scan forward until the matching look-ahead end,
3048 // without processing the look-around block. This is overly pessimistic.
3050 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3051 // lookahead contains two LA_END instructions, so count goes up by two
3052 // for each LA_START.
3053 int32_t depth
= (opType
== URX_LA_START
? 2: 1);
3056 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3057 if (URX_TYPE(op
) == URX_LA_START
) {
3060 if (URX_TYPE(op
) == URX_LB_START
) {
3063 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3069 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3070 // Need this because neg lookahead blocks will FAIL to outside
3072 int32_t jmpDest
= URX_VAL(op
);
3073 if (jmpDest
> loc
) {
3074 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3075 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3079 U_ASSERT(loc
<= end
);
3089 U_ASSERT(FALSE
); // Shouldn't get here. These ops should be
3090 // consumed by the scan in URX_LA_START and LB_START
3101 // We have finished walking through the ops. Check whether some forward jump
3102 // propagated a shorter length to location end+1.
3103 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3104 currentLen
= forwardedLength
.elementAti(end
+1);
3108 fRXPat
->fInitialChars8
->init(fRXPat
->fInitialChars
);
3111 // Sort out what we should check for when looking for candidate match start positions.
3112 // In order of preference,
3113 // 1. Start of input text buffer.
3114 // 2. A literal string.
3115 // 3. Start of line in multi-line mode.
3116 // 4. A single literal character.
3117 // 5. A character from a set of characters.
3119 if (fRXPat
->fStartType
== START_START
) {
3120 // Match only at the start of an input text string.
3121 // start type is already set. We're done.
3122 } else if (numInitialStrings
== 1 && fRXPat
->fMinMatchLen
> 0) {
3123 // Match beginning only with a literal string.
3124 UChar32 c
= fRXPat
->fLiteralText
.char32At(fRXPat
->fInitialStringIdx
);
3125 U_ASSERT(fRXPat
->fInitialChars
->contains(c
));
3126 fRXPat
->fStartType
= START_STRING
;
3127 fRXPat
->fInitialChar
= c
;
3128 } else if (fRXPat
->fStartType
== START_LINE
) {
3129 // Match at start of line in Multi-Line mode.
3130 // Nothing to do here; everything is already set.
3131 } else if (fRXPat
->fMinMatchLen
== 0) {
3132 // Zero length match possible. We could start anywhere.
3133 fRXPat
->fStartType
= START_NO_INFO
;
3134 } else if (fRXPat
->fInitialChars
->size() == 1) {
3135 // All matches begin with the same char.
3136 fRXPat
->fStartType
= START_CHAR
;
3137 fRXPat
->fInitialChar
= fRXPat
->fInitialChars
->charAt(0);
3138 U_ASSERT(fRXPat
->fInitialChar
!= (UChar32
)-1);
3139 } else if (fRXPat
->fInitialChars
->contains((UChar32
)0, (UChar32
)0x10ffff) == FALSE
&&
3140 fRXPat
->fMinMatchLen
> 0) {
3141 // Matches start with a set of character smaller than the set of all chars.
3142 fRXPat
->fStartType
= START_SET
;
3144 // Matches can start with anything
3145 fRXPat
->fStartType
= START_NO_INFO
;
3153 //------------------------------------------------------------------------------
3155 // minMatchLength Calculate the length of the shortest string that could
3156 // match the specified pattern.
3157 // Length is in 16 bit code units, not code points.
3159 // The calculated length may not be exact. The returned
3160 // value may be shorter than the actual minimum; it must
3163 // start and end are the range of p-code operations to be
3164 // examined. The endpoints are included in the range.
3166 //------------------------------------------------------------------------------
3167 int32_t RegexCompile::minMatchLength(int32_t start
, int32_t end
) {
3168 if (U_FAILURE(*fStatus
)) {
3172 U_ASSERT(start
<= end
);
3173 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3179 int32_t currentLen
= 0;
3182 // forwardedLength is a vector holding minimum-match-length values that
3183 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3184 // It must be one longer than the pattern being checked because some ops
3185 // will jmp to a end-of-block+1 location from within a block, and we must
3186 // count those when checking the block.
3187 UVector32
forwardedLength(end
+2, *fStatus
);
3188 forwardedLength
.setSize(end
+2);
3189 for (loc
=start
; loc
<=end
+1; loc
++) {
3190 forwardedLength
.setElementAt(INT32_MAX
, loc
);
3193 for (loc
= start
; loc
<=end
; loc
++) {
3194 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3195 opType
= URX_TYPE(op
);
3197 // The loop is advancing linearly through the pattern.
3198 // If the op we are now at was the destination of a branch in the pattern,
3199 // and that path has a shorter minimum length than the current accumulated value,
3200 // replace the current accumulated value.
3201 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3202 // no-match-possible cases.
3203 if (forwardedLength
.elementAti(loc
) < currentLen
) {
3204 currentLen
= forwardedLength
.elementAti(loc
);
3205 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3209 // Ops that don't change the total length matched
3210 case URX_RESERVED_OP
:
3212 case URX_STRING_LEN
:
3214 case URX_START_CAPTURE
:
3215 case URX_END_CAPTURE
:
3216 case URX_BACKSLASH_B
:
3217 case URX_BACKSLASH_BU
:
3218 case URX_BACKSLASH_G
:
3219 case URX_BACKSLASH_Z
:
3225 case URX_RELOC_OPRND
:
3226 case URX_STO_INP_LOC
:
3228 case URX_CARET_M_UNIX
:
3229 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3232 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3240 // Ops that match a minimum of one character (one or two 16 bit code units.)
3243 case URX_STATIC_SETREF
:
3244 case URX_STAT_SETREF_N
:
3246 case URX_BACKSLASH_D
:
3247 case URX_BACKSLASH_H
:
3248 case URX_BACKSLASH_R
:
3249 case URX_BACKSLASH_V
:
3251 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3252 case URX_DOTANY_ALL
: // . matches one or two.
3254 case URX_DOTANY_UNIX
:
3260 loc
++; // URX_JMPX has an extra operand, ignored here,
3261 // otherwise processed identically to URX_JMP.
3265 int32_t jmpDest
= URX_VAL(op
);
3266 if (jmpDest
< loc
) {
3267 // Loop of some kind. Can safely ignore, the worst that will happen
3268 // is that we understate the true minimum length
3269 currentLen
= forwardedLength
.elementAti(loc
+1);
3271 // Forward jump. Propagate the current min length to the target loc of the jump.
3272 U_ASSERT(jmpDest
<= end
+1);
3273 if (forwardedLength
.elementAti(jmpDest
) > currentLen
) {
3274 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3282 // Back-tracks are kind of like a branch, except that the min length was
3283 // propagated already, by the state save.
3284 currentLen
= forwardedLength
.elementAti(loc
+1);
3289 case URX_STATE_SAVE
:
3291 // State Save, for forward jumps, propagate the current minimum.
3292 // of the state save.
3293 int32_t jmpDest
= URX_VAL(op
);
3294 if (jmpDest
> loc
) {
3295 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3296 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3306 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3307 currentLen
+= URX_VAL(stringLenOp
);
3315 // TODO: with full case folding, matching input text may be shorter than
3316 // the string we have here. More smarts could put some bounds on it.
3317 // Assume a min length of one for now. A min length of zero causes
3318 // optimization failures for a pattern like "string"+
3319 // currentLen += URX_VAL(stringLenOp);
3325 case URX_CTR_INIT_NG
:
3328 // If the min loop count == 0
3329 // move loc forwards to the end of the loop, skipping over the body.
3330 // If the min count is > 0,
3331 // continue normal processing of the body of the loop.
3332 int32_t loopEndLoc
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+1);
3333 loopEndLoc
= URX_VAL(loopEndLoc
);
3334 int32_t minLoopCount
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
+2);
3335 if (minLoopCount
== 0) {
3338 loc
+=3; // Skips over operands of CTR_INIT
3345 case URX_CTR_LOOP_NG
:
3347 // The jump is conditional, backwards only.
3351 case URX_LOOP_DOT_I
:
3353 // More loop ops. These state-save to themselves.
3354 // don't change the minimum match - could match nothing at all.
3361 // Look-around. Scan forward until the matching look-ahead end,
3362 // without processing the look-around block. This is overly pessimistic for look-ahead,
3363 // it assumes that the look-ahead match might be zero-length.
3364 // TODO: Positive lookahead could recursively do the block, then continue
3365 // with the longer of the block or the value coming in. Ticket 6060
3366 int32_t depth
= (opType
== URX_LA_START
? 2: 1);;
3369 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3370 if (URX_TYPE(op
) == URX_LA_START
) {
3371 // The boilerplate for look-ahead includes two LA_END insturctions,
3372 // Depth will be decremented by each one when it is seen.
3375 if (URX_TYPE(op
) == URX_LB_START
) {
3378 if (URX_TYPE(op
) == URX_LA_END
) {
3384 if (URX_TYPE(op
)==URX_LBN_END
) {
3390 if (URX_TYPE(op
) == URX_STATE_SAVE
) {
3391 // Need this because neg lookahead blocks will FAIL to outside
3393 int32_t jmpDest
= URX_VAL(op
);
3394 if (jmpDest
> loc
) {
3395 if (currentLen
< forwardedLength
.elementAti(jmpDest
)) {
3396 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3400 U_ASSERT(loc
<= end
);
3410 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3411 // range being sized, which happens when measuring size of look-behind blocks.
3420 // We have finished walking through the ops. Check whether some forward jump
3421 // propagated a shorter length to location end+1.
3422 if (forwardedLength
.elementAti(end
+1) < currentLen
) {
3423 currentLen
= forwardedLength
.elementAti(end
+1);
3424 U_ASSERT(currentLen
>=0 && currentLen
< INT32_MAX
);
3430 // Increment with overflow check.
3431 // val and delta will both be positive.
3433 static int32_t safeIncrement(int32_t val
, int32_t delta
) {
3434 if (INT32_MAX
- val
> delta
) {
3442 //------------------------------------------------------------------------------
3444 // maxMatchLength Calculate the length of the longest string that could
3445 // match the specified pattern.
3446 // Length is in 16 bit code units, not code points.
3448 // The calculated length may not be exact. The returned
3449 // value may be longer than the actual maximum; it must
3450 // never be shorter.
3452 //------------------------------------------------------------------------------
3453 int32_t RegexCompile::maxMatchLength(int32_t start
, int32_t end
) {
3454 if (U_FAILURE(*fStatus
)) {
3457 U_ASSERT(start
<= end
);
3458 U_ASSERT(end
< fRXPat
->fCompiledPat
->size());
3464 int32_t currentLen
= 0;
3465 UVector32
forwardedLength(end
+1, *fStatus
);
3466 forwardedLength
.setSize(end
+1);
3468 for (loc
=start
; loc
<=end
; loc
++) {
3469 forwardedLength
.setElementAt(0, loc
);
3472 for (loc
= start
; loc
<=end
; loc
++) {
3473 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3474 opType
= URX_TYPE(op
);
3476 // The loop is advancing linearly through the pattern.
3477 // If the op we are now at was the destination of a branch in the pattern,
3478 // and that path has a longer maximum length than the current accumulated value,
3479 // replace the current accumulated value.
3480 if (forwardedLength
.elementAti(loc
) > currentLen
) {
3481 currentLen
= forwardedLength
.elementAti(loc
);
3485 // Ops that don't change the total length matched
3486 case URX_RESERVED_OP
:
3488 case URX_STRING_LEN
:
3490 case URX_START_CAPTURE
:
3491 case URX_END_CAPTURE
:
3492 case URX_BACKSLASH_B
:
3493 case URX_BACKSLASH_BU
:
3494 case URX_BACKSLASH_G
:
3495 case URX_BACKSLASH_Z
:
3501 case URX_RELOC_OPRND
:
3502 case URX_STO_INP_LOC
:
3504 case URX_CARET_M_UNIX
:
3506 case URX_STO_SP
: // Setup for atomic or possessive blocks. Doesn't change what can match.
3516 // Ops that increase that cause an unbounded increase in the length
3517 // of a matched string, or that increase it a hard to characterize way.
3518 // Call the max length unbounded, and stop further checking.
3519 case URX_BACKREF
: // BackRef. Must assume that it might be a zero length match
3521 case URX_BACKSLASH_X
: // Grahpeme Cluster. Minimum is 1, max unbounded.
3522 currentLen
= INT32_MAX
;
3526 // Ops that match a max of one character (possibly two 16 bit code units.)
3528 case URX_STATIC_SETREF
:
3529 case URX_STAT_SETREF_N
:
3531 case URX_BACKSLASH_D
:
3532 case URX_BACKSLASH_H
:
3533 case URX_BACKSLASH_R
:
3534 case URX_BACKSLASH_V
:
3536 case URX_DOTANY_ALL
:
3538 case URX_DOTANY_UNIX
:
3539 currentLen
= safeIncrement(currentLen
, 2);
3542 // Single literal character. Increase current max length by one or two,
3543 // depending on whether the char is in the supplementary range.
3545 currentLen
= safeIncrement(currentLen
, 1);
3546 if (URX_VAL(op
) > 0x10000) {
3547 currentLen
= safeIncrement(currentLen
, 1);
3558 int32_t jmpDest
= URX_VAL(op
);
3559 if (jmpDest
< loc
) {
3560 // Loop of some kind. Max match length is unbounded.
3561 currentLen
= INT32_MAX
;
3563 // Forward jump. Propagate the current min length to the target loc of the jump.
3564 if (forwardedLength
.elementAti(jmpDest
) < currentLen
) {
3565 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3573 // back-tracks are kind of like a branch, except that the max length was
3574 // propagated already, by the state save.
3575 currentLen
= forwardedLength
.elementAti(loc
+1);
3579 case URX_STATE_SAVE
:
3581 // State Save, for forward jumps, propagate the current minimum.
3582 // of the state save.
3583 // For backwards jumps, they create a loop, maximum
3584 // match length is unbounded.
3585 int32_t jmpDest
= URX_VAL(op
);
3586 if (jmpDest
> loc
) {
3587 if (currentLen
> forwardedLength
.elementAti(jmpDest
)) {
3588 forwardedLength
.setElementAt(currentLen
, jmpDest
);
3591 currentLen
= INT32_MAX
;
3602 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3603 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3608 // TODO: This code assumes that any user string that matches will be no longer
3609 // than our compiled string, with case insensitive matching.
3610 // Our compiled string has been case-folded already.
3612 // Any matching user string will have no more code points than our
3613 // compiled (folded) string. Folding may add code points, but
3616 // There is a potential problem if a supplemental code point
3617 // case-folds to a BMP code point. In this case our compiled string
3618 // could be shorter (in code units) than a matching user string.
3620 // At this time (Unicode 6.1) there are no such characters, and this case
3621 // is not being handled. A test, intltest regex/Bug9283, will fail if
3622 // any problematic characters are added to Unicode.
3624 // If this happens, we can make a set of the BMP chars that the
3625 // troublesome supplementals fold to, scan our string, and bump the
3626 // currentLen one extra for each that is found.
3630 int32_t stringLenOp
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3631 currentLen
= safeIncrement(currentLen
, URX_VAL(stringLenOp
));
3636 case URX_CTR_INIT_NG
:
3637 // For Loops, recursively call this function on the pattern for the loop body,
3638 // then multiply the result by the maximum loop count.
3640 int32_t loopEndLoc
= URX_VAL(fRXPat
->fCompiledPat
->elementAti(loc
+1));
3641 if (loopEndLoc
== loc
+4) {
3642 // Loop has an empty body. No affect on max match length.
3643 // Continue processing with code after the loop end.
3648 int32_t maxLoopCount
= static_cast<int32_t>(fRXPat
->fCompiledPat
->elementAti(loc
+3));
3649 if (maxLoopCount
== -1) {
3650 // Unbounded Loop. No upper bound on match length.
3651 currentLen
= INT32_MAX
;
3655 U_ASSERT(loopEndLoc
>= loc
+4);
3656 int64_t blockLen
= maxMatchLength(loc
+4, loopEndLoc
-1); // Recursive call.
3657 int64_t updatedLen
= (int64_t)currentLen
+ blockLen
* maxLoopCount
;
3658 if (updatedLen
>= INT32_MAX
) {
3659 currentLen
= INT32_MAX
;
3662 currentLen
= (int32_t)updatedLen
;
3668 case URX_CTR_LOOP_NG
:
3669 // These opcodes will be skipped over by code for URX_CRT_INIT.
3670 // We shouldn't encounter them here.
3675 case URX_LOOP_DOT_I
:
3677 // For anything to do with loops, make the match length unbounded.
3678 currentLen
= INT32_MAX
;
3685 // Look-ahead. Just ignore, treat the look-ahead block as if
3686 // it were normal pattern. Gives a too-long match length,
3687 // but good enough for now.
3690 // End of look-ahead ops should always be consumed by the processing at
3691 // the URX_LA_START op.
3697 // Look-behind. Scan forward until the matching look-around end,
3698 // without processing the look-behind block.
3702 op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3703 if (URX_TYPE(op
) == URX_LA_START
|| URX_TYPE(op
) == URX_LB_START
) {
3706 if (URX_TYPE(op
) == URX_LA_END
|| URX_TYPE(op
)==URX_LBN_END
) {
3712 U_ASSERT(loc
< end
);
3722 if (currentLen
== INT32_MAX
) {
3723 // The maximum length is unbounded.
3724 // Stop further processing of the pattern.
3734 //------------------------------------------------------------------------------
3736 // stripNOPs Remove any NOP operations from the compiled pattern code.
3737 // Extra NOPs are inserted for some constructs during the initial
3738 // code generation to provide locations that may be patched later.
3739 // Many end up unneeded, and are removed by this function.
3741 // In order to minimize the number of passes through the pattern,
3742 // back-reference fixup is also performed here (adjusting
3743 // back-reference operands to point to the correct frame offsets).
3745 //------------------------------------------------------------------------------
3746 void RegexCompile::stripNOPs() {
3748 if (U_FAILURE(*fStatus
)) {
3752 int32_t end
= fRXPat
->fCompiledPat
->size();
3753 UVector32
deltas(end
, *fStatus
);
3755 // Make a first pass over the code, computing the amount that things
3756 // will be offset at each location in the original code.
3759 for (loc
=0; loc
<end
; loc
++) {
3760 deltas
.addElement(d
, *fStatus
);
3761 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(loc
);
3762 if (URX_TYPE(op
) == URX_NOP
) {
3767 UnicodeString caseStringBuffer
;
3769 // Make a second pass over the code, removing the NOPs by moving following
3770 // code up, and patching operands that refer to code locations that
3771 // are being moved. The array of offsets from the first step is used
3772 // to compute the new operand values.
3775 for (src
=0; src
<end
; src
++) {
3776 int32_t op
= (int32_t)fRXPat
->fCompiledPat
->elementAti(src
);
3777 int32_t opType
= URX_TYPE(op
);
3782 case URX_STATE_SAVE
:
3785 case URX_CTR_LOOP_NG
:
3786 case URX_RELOC_OPRND
:
3790 // These are instructions with operands that refer to code locations.
3792 int32_t operandAddress
= URX_VAL(op
);
3793 U_ASSERT(operandAddress
>=0 && operandAddress
<deltas
.size());
3794 int32_t fixedOperandAddress
= operandAddress
- deltas
.elementAti(operandAddress
);
3795 op
= buildOp(opType
, fixedOperandAddress
);
3796 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3804 int32_t where
= URX_VAL(op
);
3805 if (where
> fRXPat
->fGroupMap
->size()) {
3806 error(U_REGEX_INVALID_BACK_REF
);
3809 where
= fRXPat
->fGroupMap
->elementAti(where
-1);
3810 op
= buildOp(opType
, where
);
3811 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3814 fRXPat
->fNeedsAltInput
= TRUE
;
3817 case URX_RESERVED_OP
:
3818 case URX_RESERVED_OP_N
:
3823 case URX_STRING_LEN
:
3824 case URX_START_CAPTURE
:
3825 case URX_END_CAPTURE
:
3826 case URX_STATIC_SETREF
:
3827 case URX_STAT_SETREF_N
:
3831 case URX_BACKSLASH_B
:
3832 case URX_BACKSLASH_BU
:
3833 case URX_BACKSLASH_G
:
3834 case URX_BACKSLASH_X
:
3835 case URX_BACKSLASH_Z
:
3836 case URX_DOTANY_ALL
:
3837 case URX_BACKSLASH_D
:
3841 case URX_CTR_INIT_NG
:
3842 case URX_DOTANY_UNIX
:
3845 case URX_STO_INP_LOC
:
3852 case URX_CARET_M_UNIX
:
3859 case URX_LOOP_DOT_I
:
3863 case URX_BACKSLASH_H
:
3864 case URX_BACKSLASH_R
:
3865 case URX_BACKSLASH_V
:
3866 // These instructions are unaltered by the relocation.
3867 fRXPat
->fCompiledPat
->setElementAt(op
, dst
);
3872 // Some op is unaccounted for.
3874 error(U_REGEX_INTERNAL_ERROR
);
3878 fRXPat
->fCompiledPat
->setSize(dst
);
3884 //------------------------------------------------------------------------------
3886 // Error Report a rule parse error.
3887 // Only report it if no previous error has been recorded.
3889 //------------------------------------------------------------------------------
3890 void RegexCompile::error(UErrorCode e
) {
3891 if (U_SUCCESS(*fStatus
)) {
3893 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3894 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3895 // int64_t. If the values of the latter are out of range for the former,
3896 // set them to the appropriate "field not supported" values.
3897 if (fLineNum
> 0x7FFFFFFF) {
3898 fParseErr
->line
= 0;
3899 fParseErr
->offset
= -1;
3900 } else if (fCharNum
> 0x7FFFFFFF) {
3901 fParseErr
->line
= (int32_t)fLineNum
;
3902 fParseErr
->offset
= -1;
3904 fParseErr
->line
= (int32_t)fLineNum
;
3905 fParseErr
->offset
= (int32_t)fCharNum
;
3908 UErrorCode status
= U_ZERO_ERROR
; // throwaway status for extracting context
3910 // Fill in the context.
3911 // Note: extractBetween() pins supplied indicies to the string bounds.
3912 uprv_memset(fParseErr
->preContext
, 0, sizeof(fParseErr
->preContext
));
3913 uprv_memset(fParseErr
->postContext
, 0, sizeof(fParseErr
->postContext
));
3914 utext_extract(fRXPat
->fPattern
, fScanIndex
-U_PARSE_CONTEXT_LEN
+1, fScanIndex
, fParseErr
->preContext
, U_PARSE_CONTEXT_LEN
, &status
);
3915 utext_extract(fRXPat
->fPattern
, fScanIndex
, fScanIndex
+U_PARSE_CONTEXT_LEN
-1, fParseErr
->postContext
, U_PARSE_CONTEXT_LEN
, &status
);
3921 // Assorted Unicode character constants.
3922 // Numeric because there is no portable way to enter them as literals.
3925 static const UChar chCR
= 0x0d; // New lines, for terminating comments.
3926 static const UChar chLF
= 0x0a; // Line Feed
3927 static const UChar chPound
= 0x23; // '#', introduces a comment.
3928 static const UChar chDigit0
= 0x30; // '0'
3929 static const UChar chDigit7
= 0x37; // '9'
3930 static const UChar chColon
= 0x3A; // ':'
3931 static const UChar chE
= 0x45; // 'E'
3932 static const UChar chQ
= 0x51; // 'Q'
3933 //static const UChar chN = 0x4E; // 'N'
3934 static const UChar chP
= 0x50; // 'P'
3935 static const UChar chBackSlash
= 0x5c; // '\' introduces a char escape
3936 //static const UChar chLBracket = 0x5b; // '['
3937 static const UChar chRBracket
= 0x5d; // ']'
3938 static const UChar chUp
= 0x5e; // '^'
3939 static const UChar chLowerP
= 0x70;
3940 static const UChar chLBrace
= 0x7b; // '{'
3941 static const UChar chRBrace
= 0x7d; // '}'
3942 static const UChar chNEL
= 0x85; // NEL newline variant
3943 static const UChar chLS
= 0x2028; // Unicode Line Separator
3946 //------------------------------------------------------------------------------
3948 // nextCharLL Low Level Next Char from the regex pattern.
3949 // Get a char from the string, keep track of input position
3950 // for error reporting.
3952 //------------------------------------------------------------------------------
3953 UChar32
RegexCompile::nextCharLL() {
3956 if (fPeekChar
!= -1) {
3962 // assume we're already in the right place
3963 ch
= UTEXT_NEXT32(fRXPat
->fPattern
);
3964 if (ch
== U_SENTINEL
) {
3971 (ch
== chLF
&& fLastChar
!= chCR
)) {
3972 // Character is starting a new line. Bump up the line number, and
3973 // reset the column to 0.
3978 // Character is not starting a new line. Except in the case of a
3979 // LF following a CR, increment the column position.
3988 //------------------------------------------------------------------------------
3990 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3991 // character without actually getting it.
3993 //------------------------------------------------------------------------------
3994 UChar32
RegexCompile::peekCharLL() {
3995 if (fPeekChar
== -1) {
3996 fPeekChar
= nextCharLL();
4002 //------------------------------------------------------------------------------
4004 // nextChar for pattern scanning. At this level, we handle stripping
4005 // out comments and processing some backslash character escapes.
4006 // The rest of the pattern grammar is handled at the next level up.
4008 //------------------------------------------------------------------------------
4009 void RegexCompile::nextChar(RegexPatternChar
&c
) {
4011 fScanIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4012 c
.fChar
= nextCharLL();
4017 if ((c
.fChar
==chBackSlash
&& peekCharLL()==chE
&& ((fModeFlags
& UREGEX_LITERAL
) == 0)) ||
4018 c
.fChar
== (UChar32
)-1) {
4019 fQuoteMode
= FALSE
; // Exit quote mode,
4020 nextCharLL(); // discard the E
4021 nextChar(c
); // recurse to get the real next char
4024 else if (fInBackslashQuote
) {
4025 // The current character immediately follows a '\'
4026 // Don't check for any further escapes, just return it as-is.
4027 // Don't set c.fQuoted, because that would prevent the state machine from
4028 // dispatching on the character.
4029 fInBackslashQuote
= FALSE
;
4033 // We are not in a \Q quoted region \E of the source.
4035 if (fModeFlags
& UREGEX_COMMENTS
) {
4037 // We are in free-spacing and comments mode.
4038 // Scan through any white space and comments, until we
4039 // reach a significant character or the end of inut.
4041 if (c
.fChar
== (UChar32
)-1) {
4042 break; // End of Input
4044 if (c
.fChar
== chPound
&& fEOLComments
== TRUE
) {
4045 // Start of a comment. Consume the rest of it, until EOF or a new line
4047 c
.fChar
= nextCharLL();
4048 if (c
.fChar
== (UChar32
)-1 || // EOF
4057 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4058 if (PatternProps::isWhiteSpace(c
.fChar
) == FALSE
) {
4061 c
.fChar
= nextCharLL();
4066 // check for backslash escaped characters.
4068 if (c
.fChar
== chBackSlash
) {
4069 int64_t pos
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4070 if (RegexStaticSets::gStaticSets
->fUnescapeCharSet
.contains(peekCharLL())) {
4072 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4073 // Includes \uxxxx, \n, \r, many others.
4074 // Return the single equivalent character.
4076 nextCharLL(); // get & discard the peeked char.
4079 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat
->fPattern
, fPatternLength
)) {
4080 int32_t endIndex
= (int32_t)pos
;
4081 c
.fChar
= u_unescapeAt(uregex_ucstr_unescape_charAt
, &endIndex
, (int32_t)fPatternLength
, (void *)fRXPat
->fPattern
->chunkContents
);
4083 if (endIndex
== pos
) {
4084 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4086 fCharNum
+= endIndex
- pos
;
4087 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, endIndex
);
4090 struct URegexUTextUnescapeCharContext context
= U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat
->fPattern
);
4092 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, pos
);
4093 c
.fChar
= u_unescapeAt(uregex_utext_unescape_charAt
, &offset
, INT32_MAX
, &context
);
4096 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4097 } else if (context
.lastOffset
== offset
) {
4098 UTEXT_PREVIOUS32(fRXPat
->fPattern
);
4099 } else if (context
.lastOffset
!= offset
-1) {
4100 utext_moveIndex32(fRXPat
->fPattern
, offset
- context
.lastOffset
- 1);
4105 else if (peekCharLL() == chDigit0
) {
4106 // Octal Escape, using Java Regexp Conventions
4107 // which are \0 followed by 1-3 octal digits.
4108 // Different from ICU Unescape handling of Octal, which does not
4109 // require the leading 0.
4110 // Java also has the convention of only consuming 2 octal digits if
4111 // the three digit number would be > 0xff
4114 nextCharLL(); // Consume the initial 0.
4116 for (index
=0; index
<3; index
++) {
4117 int32_t ch
= peekCharLL();
4118 if (ch
<chDigit0
|| ch
>chDigit7
) {
4120 // \0 is not followed by any octal digits.
4121 error(U_REGEX_BAD_ESCAPE_SEQUENCE
);
4127 if (c
.fChar
<= 255) {
4130 // The last digit made the number too big. Forget we saw it.
4136 else if (peekCharLL() == chQ
) {
4137 // "\Q" enter quote mode, which will continue until "\E"
4139 nextCharLL(); // discard the 'Q'.
4140 nextChar(c
); // recurse to get the real next char.
4144 // We are in a '\' escape that will be handled by the state table scanner.
4145 // Just return the backslash, but remember that the following char is to
4146 // be taken literally.
4147 fInBackslashQuote
= TRUE
;
4152 // re-enable # to end-of-line comments, in case they were disabled.
4153 // They are disabled by the parser upon seeing '(?', but this lasts for
4154 // the fetching of the next character only.
4155 fEOLComments
= TRUE
;
4157 // putc(c.fChar, stdout);
4162 //------------------------------------------------------------------------------
4165 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4167 // The scan position will be at the 'N'. On return
4168 // the scan position should be just after the '}'
4170 // Return the UChar32
4172 //------------------------------------------------------------------------------
4173 UChar32
RegexCompile::scanNamedChar() {
4174 if (U_FAILURE(*fStatus
)) {
4179 if (fC
.fChar
!= chLBrace
) {
4180 error(U_REGEX_PROPERTY_SYNTAX
);
4184 UnicodeString charName
;
4187 if (fC
.fChar
== chRBrace
) {
4190 if (fC
.fChar
== -1) {
4191 error(U_REGEX_PROPERTY_SYNTAX
);
4194 charName
.append(fC
.fChar
);
4198 if (!uprv_isInvariantUString(charName
.getBuffer(), charName
.length()) ||
4199 (uint32_t)charName
.length()>=sizeof(name
)) {
4200 // All Unicode character names have only invariant characters.
4201 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4202 // which requires this error check
4203 error(U_REGEX_PROPERTY_SYNTAX
);
4206 charName
.extract(0, charName
.length(), name
, sizeof(name
), US_INV
);
4208 UChar32 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, name
, fStatus
);
4209 if (U_FAILURE(*fStatus
)) {
4210 error(U_REGEX_PROPERTY_SYNTAX
);
4213 nextChar(fC
); // Continue overall regex pattern processing with char after the '}'
4217 //------------------------------------------------------------------------------
4219 // scanProp Construct a UnicodeSet from the text at the current scan
4220 // position, which will be of the form \p{whaterver}
4222 // The scan position will be at the 'p' or 'P'. On return
4223 // the scan position should be just after the '}'
4225 // Return a UnicodeSet, constructed from the \P pattern,
4226 // or NULL if the pattern is invalid.
4228 //------------------------------------------------------------------------------
4229 UnicodeSet
*RegexCompile::scanProp() {
4230 UnicodeSet
*uset
= NULL
;
4232 if (U_FAILURE(*fStatus
)) {
4235 (void)chLowerP
; // Suppress compiler unused variable warning.
4236 U_ASSERT(fC
.fChar
== chLowerP
|| fC
.fChar
== chP
);
4237 UBool negated
= (fC
.fChar
== chP
);
4239 UnicodeString propertyName
;
4241 if (fC
.fChar
!= chLBrace
) {
4242 error(U_REGEX_PROPERTY_SYNTAX
);
4247 if (fC
.fChar
== chRBrace
) {
4250 if (fC
.fChar
== -1) {
4251 // Hit the end of the input string without finding the closing '}'
4252 error(U_REGEX_PROPERTY_SYNTAX
);
4255 propertyName
.append(fC
.fChar
);
4257 uset
= createSetForProperty(propertyName
, negated
);
4258 nextChar(fC
); // Move input scan to position following the closing '}'
4262 //------------------------------------------------------------------------------
4264 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4265 // position, which is expected be of the form [:property expression:]
4267 // The scan position will be at the opening ':'. On return
4268 // the scan position must be on the closing ']'
4270 // Return a UnicodeSet constructed from the pattern,
4271 // or NULL if this is not a valid POSIX-style set expression.
4272 // If not a property expression, restore the initial scan position
4273 // (to the opening ':')
4275 // Note: the opening '[:' is not sufficient to guarantee that
4276 // this is a [:property:] expression.
4277 // [:'+=,] is a perfectly good ordinary set expression that
4278 // happens to include ':' as one of its characters.
4280 //------------------------------------------------------------------------------
4281 UnicodeSet
*RegexCompile::scanPosixProp() {
4282 UnicodeSet
*uset
= NULL
;
4284 if (U_FAILURE(*fStatus
)) {
4288 U_ASSERT(fC
.fChar
== chColon
);
4290 // Save the scanner state.
4291 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4292 int64_t savedScanIndex
= fScanIndex
;
4293 int64_t savedNextIndex
= UTEXT_GETNATIVEINDEX(fRXPat
->fPattern
);
4294 UBool savedQuoteMode
= fQuoteMode
;
4295 UBool savedInBackslashQuote
= fInBackslashQuote
;
4296 UBool savedEOLComments
= fEOLComments
;
4297 int64_t savedLineNum
= fLineNum
;
4298 int64_t savedCharNum
= fCharNum
;
4299 UChar32 savedLastChar
= fLastChar
;
4300 UChar32 savedPeekChar
= fPeekChar
;
4301 RegexPatternChar savedfC
= fC
;
4303 // Scan for a closing ]. A little tricky because there are some perverse
4304 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4305 // ending on the second closing ].
4307 UnicodeString propName
;
4308 UBool negated
= FALSE
;
4310 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4312 if (fC
.fChar
== chUp
) {
4317 // Scan for the closing ":]", collecting the property name along the way.
4318 UBool sawPropSetTerminator
= FALSE
;
4320 propName
.append(fC
.fChar
);
4322 if (fC
.fQuoted
|| fC
.fChar
== -1) {
4323 // Escaped characters or end of input - either says this isn't a [:Property:]
4326 if (fC
.fChar
== chColon
) {
4328 if (fC
.fChar
== chRBracket
) {
4329 sawPropSetTerminator
= TRUE
;
4335 if (sawPropSetTerminator
) {
4336 uset
= createSetForProperty(propName
, negated
);
4341 // Restore the original scan position.
4342 // The main scanner will retry the input as a normal set expression,
4343 // not a [:Property:] expression.
4344 fScanIndex
= savedScanIndex
;
4345 fQuoteMode
= savedQuoteMode
;
4346 fInBackslashQuote
= savedInBackslashQuote
;
4347 fEOLComments
= savedEOLComments
;
4348 fLineNum
= savedLineNum
;
4349 fCharNum
= savedCharNum
;
4350 fLastChar
= savedLastChar
;
4351 fPeekChar
= savedPeekChar
;
4353 UTEXT_SETNATIVEINDEX(fRXPat
->fPattern
, savedNextIndex
);
4358 static inline void addIdentifierIgnorable(UnicodeSet
*set
, UErrorCode
& ec
) {
4359 set
->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4360 addCategory(set
, U_GC_CF_MASK
, ec
);
4364 // Create a Unicode Set from a Unicode Property expression.
4365 // This is common code underlying both \p{...} ane [:...:] expressions.
4366 // Includes trying the Java "properties" that aren't supported as
4367 // normal ICU UnicodeSet properties
4369 static const UChar posSetPrefix
[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4370 static const UChar negSetPrefix
[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4371 UnicodeSet
*RegexCompile::createSetForProperty(const UnicodeString
&propName
, UBool negated
) {
4372 UnicodeString setExpr
;
4374 uint32_t usetFlags
= 0;
4376 if (U_FAILURE(*fStatus
)) {
4381 // First try the property as we received it
4384 setExpr
.append(negSetPrefix
, -1);
4386 setExpr
.append(posSetPrefix
, -1);
4388 setExpr
.append(propName
);
4389 setExpr
.append(chRBrace
);
4390 setExpr
.append(chRBracket
);
4391 if (fModeFlags
& UREGEX_CASE_INSENSITIVE
) {
4392 usetFlags
|= USET_CASE_INSENSITIVE
;
4394 set
= new UnicodeSet(setExpr
, usetFlags
, NULL
, *fStatus
);
4395 if (U_SUCCESS(*fStatus
)) {
4402 // The property as it was didn't work.
4404 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
4405 // or standard Java, but many other regular expression packages do recognize it.
4407 if (propName
.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4408 *fStatus
= U_ZERO_ERROR
;
4409 set
= new UnicodeSet(*(fRXPat
->fStaticSets
[URX_ISWORD_SET
]));
4411 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
4422 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4423 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4425 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4426 // is accepted by Java. The property part of the name is compared
4427 // case-insenstively. The spaces must be exactly as shown, either
4428 // all there, or all omitted, with exactly one at each position
4429 // if they are present. From checking against JDK 1.6
4431 // This code should be removed when ICU properties support the Java compatibility names
4434 UnicodeString mPropName
= propName
;
4435 if (mPropName
.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4436 mPropName
= UNICODE_STRING_SIMPLE("InGreek and Coptic");
4438 if (mPropName
.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4439 mPropName
.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4440 mPropName
= UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4442 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4443 mPropName
= UNICODE_STRING_SIMPLE("javaValidCodePoint");
4446 // See if the property looks like a Java "InBlockName", which
4447 // we will recast as "Block=BlockName"
4449 static const UChar IN
[] = {0x49, 0x6E, 0}; // "In"
4450 static const UChar BLOCK
[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
4451 if (mPropName
.startsWith(IN
, 2) && propName
.length()>=3) {
4452 setExpr
.truncate(4); // Leaves "[\p{", or "[\P{"
4453 setExpr
.append(BLOCK
, -1);
4454 setExpr
.append(UnicodeString(mPropName
, 2)); // Property with the leading "In" removed.
4455 setExpr
.append(chRBrace
);
4456 setExpr
.append(chRBracket
);
4457 *fStatus
= U_ZERO_ERROR
;
4458 set
= new UnicodeSet(setExpr
, usetFlags
, NULL
, *fStatus
);
4459 if (U_SUCCESS(*fStatus
)) {
4466 if (propName
.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4467 propName
.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4469 UErrorCode localStatus
= U_ZERO_ERROR
;
4471 set
= new UnicodeSet();
4473 // Try the various Java specific properties.
4474 // These all begin with "java"
4476 if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4477 addCategory(set
, U_GC_CN_MASK
, localStatus
);
4480 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4481 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4483 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4484 addIdentifierIgnorable(set
, localStatus
);
4486 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4487 set
->add(0, 0x1F).add(0x7F, 0x9F);
4489 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4490 addCategory(set
, U_GC_L_MASK
, localStatus
);
4491 addCategory(set
, U_GC_SC_MASK
, localStatus
);
4492 addCategory(set
, U_GC_PC_MASK
, localStatus
);
4493 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4494 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4495 addCategory(set
, U_GC_MC_MASK
, localStatus
);
4496 addCategory(set
, U_GC_MN_MASK
, localStatus
);
4497 addIdentifierIgnorable(set
, localStatus
);
4499 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4500 addCategory(set
, U_GC_L_MASK
, localStatus
);
4501 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4502 addCategory(set
, U_GC_SC_MASK
, localStatus
);
4503 addCategory(set
, U_GC_PC_MASK
, localStatus
);
4505 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4506 addCategory(set
, U_GC_L_MASK
, localStatus
);
4508 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4509 addCategory(set
, U_GC_L_MASK
, localStatus
);
4510 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4512 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4513 addCategory(set
, U_GC_LL_MASK
, localStatus
);
4515 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4516 set
->applyIntPropertyValue(UCHAR_BIDI_MIRRORED
, 1, localStatus
);
4518 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4519 addCategory(set
, U_GC_Z_MASK
, localStatus
);
4521 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4522 set
->add(0x10000, UnicodeSet::MAX_VALUE
);
4524 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4525 addCategory(set
, U_GC_LT_MASK
, localStatus
);
4527 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4528 addCategory(set
, U_GC_L_MASK
, localStatus
);
4529 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4531 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4532 addCategory(set
, U_GC_L_MASK
, localStatus
);
4533 addCategory(set
, U_GC_PC_MASK
, localStatus
);
4534 addCategory(set
, U_GC_ND_MASK
, localStatus
);
4535 addCategory(set
, U_GC_NL_MASK
, localStatus
);
4536 addCategory(set
, U_GC_MC_MASK
, localStatus
);
4537 addCategory(set
, U_GC_MN_MASK
, localStatus
);
4538 addIdentifierIgnorable(set
, localStatus
);
4540 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4541 addCategory(set
, U_GC_LU_MASK
, localStatus
);
4543 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4544 set
->add(0, UnicodeSet::MAX_VALUE
);
4546 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4547 addCategory(set
, U_GC_Z_MASK
, localStatus
);
4548 set
->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4549 set
->add(9, 0x0d).add(0x1c, 0x1f);
4551 else if (mPropName
.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4552 set
->add(0, UnicodeSet::MAX_VALUE
);
4555 if (U_SUCCESS(localStatus
) && !set
->isEmpty()) {
4556 *fStatus
= U_ZERO_ERROR
;
4557 if (usetFlags
& USET_CASE_INSENSITIVE
) {
4558 set
->closeOver(USET_CASE_INSENSITIVE
);
4575 // SetEval Part of the evaluation of [set expressions].
4576 // Perform any pending (stacked) operations with precedence
4577 // equal or greater to that of the next operator encountered
4578 // in the expression.
4580 void RegexCompile::setEval(int32_t nextOp
) {
4581 UnicodeSet
*rightOperand
= NULL
;
4582 UnicodeSet
*leftOperand
= NULL
;
4584 U_ASSERT(fSetOpStack
.empty()==FALSE
);
4585 int32_t pendingSetOperation
= fSetOpStack
.peeki();
4586 if ((pendingSetOperation
&0xffff0000) < (nextOp
&0xffff0000)) {
4590 U_ASSERT(fSetStack
.empty() == FALSE
);
4591 rightOperand
= (UnicodeSet
*)fSetStack
.peek();
4592 switch (pendingSetOperation
) {
4594 rightOperand
->complement();
4597 // TODO: need a simple close function. Ticket 6065
4598 rightOperand
->closeOver(USET_CASE_INSENSITIVE
);
4599 rightOperand
->removeAllStrings();
4601 case setDifference1
:
4602 case setDifference2
:
4604 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4605 leftOperand
->removeAll(*rightOperand
);
4606 delete rightOperand
;
4608 case setIntersection1
:
4609 case setIntersection2
:
4611 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4612 leftOperand
->retainAll(*rightOperand
);
4613 delete rightOperand
;
4617 leftOperand
= (UnicodeSet
*)fSetStack
.peek();
4618 leftOperand
->addAll(*rightOperand
);
4619 delete rightOperand
;
4628 void RegexCompile::setPushOp(int32_t op
) {
4630 fSetOpStack
.push(op
, *fStatus
);
4631 fSetStack
.push(new UnicodeSet(), *fStatus
);
4635 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS