]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/regexcmp.cpp
ICU-57149.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / regexcmp.cpp
1 //
2 // file: regexcmp.cpp
3 //
4 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
5 // All Rights Reserved.
6 //
7 // This file contains the ICU regular expression compiler, which is responsible
8 // for processing a regular expression pattern into the compiled form that
9 // is used by the match finding engine.
10 //
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16 #include "unicode/ustring.h"
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "patternprops.h"
27 #include "putilimp.h"
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "uassert.h"
33 #include "uinvchar.h"
34
35 #include "regeximp.h"
36 #include "regexcst.h" // Contains state table for the regex pattern parser.
37 // generated by a Perl script.
38 #include "regexcmp.h"
39 #include "regexst.h"
40 #include "regextxt.h"
41
42
43
44 U_NAMESPACE_BEGIN
45
46
47 //------------------------------------------------------------------------------
48 //
49 // Constructor.
50 //
51 //------------------------------------------------------------------------------
52 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
53 fParenStack(status), fSetStack(status), fSetOpStack(status)
54 {
55 // Lazy init of all shared global sets (needed for init()'s empty text)
56 RegexStaticSets::initGlobals(&status);
57
58 fStatus = &status;
59
60 fRXPat = rxp;
61 fScanIndex = 0;
62 fLastChar = -1;
63 fPeekChar = -1;
64 fLineNum = 1;
65 fCharNum = 0;
66 fQuoteMode = FALSE;
67 fInBackslashQuote = FALSE;
68 fModeFlags = fRXPat->fFlags | 0x80000000;
69 fEOLComments = TRUE;
70
71 fMatchOpenParen = -1;
72 fMatchCloseParen = -1;
73 fCaptureName = NULL;
74 fLastSetLiteral = U_SENTINEL;
75
76 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
77 status = rxp->fDeferredStatus;
78 }
79 }
80
81 static const UChar chAmp = 0x26; // '&'
82 static const UChar chDash = 0x2d; // '-'
83
84
85 //------------------------------------------------------------------------------
86 //
87 // Destructor
88 //
89 //------------------------------------------------------------------------------
90 RegexCompile::~RegexCompile() {
91 delete fCaptureName; // Normally will be NULL, but can exist if pattern
92 // compilation stops with a syntax error.
93 }
94
95 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
96 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
97 }
98
99 //------------------------------------------------------------------------------
100 //
101 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
102 // The state tables are hand-written in the file regexcst.txt,
103 // and converted to the form used here by a perl
104 // script regexcst.pl
105 //
106 //------------------------------------------------------------------------------
107 void RegexCompile::compile(
108 const UnicodeString &pat, // Source pat to be compiled.
109 UParseError &pp, // Error position info
110 UErrorCode &e) // Error Code
111 {
112 fRXPat->fPatternString = new UnicodeString(pat);
113 UText patternText = UTEXT_INITIALIZER;
114 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
115
116 if (U_SUCCESS(e)) {
117 compile(&patternText, pp, e);
118 utext_close(&patternText);
119 }
120 }
121
122 //
123 // compile, UText mode
124 // All the work is actually done here.
125 //
126 void RegexCompile::compile(
127 UText *pat, // Source pat to be compiled.
128 UParseError &pp, // Error position info
129 UErrorCode &e) // Error Code
130 {
131 fStatus = &e;
132 fParseErr = &pp;
133 fStackPtr = 0;
134 fStack[fStackPtr] = 0;
135
136 if (U_FAILURE(*fStatus)) {
137 return;
138 }
139
140 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
141 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
142
143 // Prepare the RegexPattern object to receive the compiled pattern.
144 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
145 if (U_FAILURE(*fStatus)) {
146 return;
147 }
148 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
149 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
150
151
152 // Initialize the pattern scanning state machine
153 fPatternLength = utext_nativeLength(pat);
154 uint16_t state = 1;
155 const RegexTableEl *tableEl;
156
157 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158 if (fModeFlags & UREGEX_LITERAL) {
159 fQuoteMode = TRUE;
160 }
161
162 nextChar(fC); // Fetch the first char from the pattern string.
163
164 //
165 // Main loop for the regex pattern parsing state machine.
166 // Runs once per state transition.
167 // Each time through optionally performs, depending on the state table,
168 // - an advance to the the next pattern char
169 // - an action to be performed.
170 // - pushing or popping a state to/from the local state return stack.
171 // file regexcst.txt is the source for the state table. The logic behind
172 // recongizing the pattern syntax is there, not here.
173 //
174 for (;;) {
175 // Bail out if anything has gone wrong.
176 // Regex pattern parsing stops on the first error encountered.
177 if (U_FAILURE(*fStatus)) {
178 break;
179 }
180
181 U_ASSERT(state != 0);
182
183 // Find the state table element that matches the input char from the pattern, or the
184 // class of the input character. Start with the first table row for this
185 // state, then linearly scan forward until we find a row that matches the
186 // character. The last row for each state always matches all characters, so
187 // the search will stop there, if not before.
188 //
189 tableEl = &gRuleParseStateTable[state];
190 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193 for (;;) { // loop through table rows belonging to this state, looking for one
194 // that matches the current input char.
195 REGEX_SCAN_DEBUG_PRINTF(("."));
196 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
197 // Table row specified an individual character, not a set, and
198 // the input character is not quoted, and
199 // the input character matched it.
200 break;
201 }
202 if (tableEl->fCharClass == 255) {
203 // Table row specified default, match anything character class.
204 break;
205 }
206 if (tableEl->fCharClass == 254 && fC.fQuoted) {
207 // Table row specified "quoted" and the char was quoted.
208 break;
209 }
210 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
211 // Table row specified eof and we hit eof on the input.
212 break;
213 }
214
215 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
216 fC.fQuoted == FALSE && // char is not escaped &&
217 fC.fChar != (UChar32)-1) { // char is not EOF
218 U_ASSERT(tableEl->fCharClass <= 137);
219 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220 // Table row specified a character class, or set of characters,
221 // and the current char matches it.
222 break;
223 }
224 }
225
226 // No match on this row, advance to the next row for this state,
227 tableEl++;
228 }
229 REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231 //
232 // We've found the row of the state table that matches the current input
233 // character from the rules string.
234 // Perform any action specified by this row in the state table.
235 if (doParseActions(tableEl->fAction) == FALSE) {
236 // Break out of the state machine loop if the
237 // the action signalled some kind of error, or
238 // the action was to exit, occurs on normal end-of-rules-input.
239 break;
240 }
241
242 if (tableEl->fPushState != 0) {
243 fStackPtr++;
244 if (fStackPtr >= kStackSize) {
245 error(U_REGEX_INTERNAL_ERROR);
246 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247 fStackPtr--;
248 }
249 fStack[fStackPtr] = tableEl->fPushState;
250 }
251
252 //
253 // NextChar. This is where characters are actually fetched from the pattern.
254 // Happens under control of the 'n' tag in the state table.
255 //
256 if (tableEl->fNextChar) {
257 nextChar(fC);
258 }
259
260 // Get the next state from the table entry, or from the
261 // state stack if the next state was specified as "pop".
262 if (tableEl->fNextState != 255) {
263 state = tableEl->fNextState;
264 } else {
265 state = fStack[fStackPtr];
266 fStackPtr--;
267 if (fStackPtr < 0) {
268 // state stack underflow
269 // This will occur if the user pattern has mis-matched parentheses,
270 // with extra close parens.
271 //
272 fStackPtr++;
273 error(U_REGEX_MISMATCHED_PAREN);
274 }
275 }
276
277 }
278
279 if (U_FAILURE(*fStatus)) {
280 // Bail out if the pattern had errors.
281 // Set stack cleanup: a successful compile would have left it empty,
282 // but errors can leave temporary sets hanging around.
283 while (!fSetStack.empty()) {
284 delete (UnicodeSet *)fSetStack.pop();
285 }
286 return;
287 }
288
289 //
290 // The pattern has now been read and processed, and the compiled code generated.
291 //
292
293 //
294 // The pattern's fFrameSize so far has accumulated the requirements for
295 // storage for capture parentheses, counters, etc. that are encountered
296 // in the pattern. Add space for the two variables that are always
297 // present in the saved state: the input string position (int64_t) and
298 // the position in the compiled pattern.
299 //
300 allocateStackData(RESTACKFRAME_HDRCOUNT);
301
302 //
303 // Optimization pass 1: NOPs, back-references, and case-folding
304 //
305 stripNOPs();
306
307 //
308 // Get bounds for the minimum and maximum length of a string that this
309 // pattern can match. Used to avoid looking for matches in strings that
310 // are too short.
311 //
312 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
313
314 //
315 // Optimization pass 2: match start type
316 //
317 matchStartType();
318
319 //
320 // Set up fast latin-1 range sets
321 //
322 int32_t numSets = fRXPat->fSets->size();
323 fRXPat->fSets8 = new Regex8BitSet[numSets];
324 // Null pointer check.
325 if (fRXPat->fSets8 == NULL) {
326 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
327 return;
328 }
329 int32_t i;
330 for (i=0; i<numSets; i++) {
331 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
332 fRXPat->fSets8[i].init(s);
333 }
334
335 }
336
337
338
339
340
341 //------------------------------------------------------------------------------
342 //
343 // doParseAction Do some action during regex pattern parsing.
344 // Called by the parse state machine.
345 //
346 // Generation of the match engine PCode happens here, or
347 // in functions called from the parse actions defined here.
348 //
349 //
350 //------------------------------------------------------------------------------
351 UBool RegexCompile::doParseActions(int32_t action)
352 {
353 UBool returnVal = TRUE;
354
355 switch ((Regex_PatternParseAction)action) {
356
357 case doPatStart:
358 // Start of pattern compiles to:
359 //0 SAVE 2 Fall back to position of FAIL
360 //1 jmp 3
361 //2 FAIL Stop if we ever reach here.
362 //3 NOP Dummy, so start of pattern looks the same as
363 // the start of an ( grouping.
364 //4 NOP Resreved, will be replaced by a save if there are
365 // OR | operators at the top level
366 appendOp(URX_STATE_SAVE, 2);
367 appendOp(URX_JMP, 3);
368 appendOp(URX_FAIL, 0);
369
370 // Standard open nonCapture paren action emits the two NOPs and
371 // sets up the paren stack frame.
372 doParseActions(doOpenNonCaptureParen);
373 break;
374
375 case doPatFinish:
376 // We've scanned to the end of the pattern
377 // The end of pattern compiles to:
378 // URX_END
379 // which will stop the runtime match engine.
380 // Encountering end of pattern also behaves like a close paren,
381 // and forces fixups of the State Save at the beginning of the compiled pattern
382 // and of any OR operations at the top level.
383 //
384 handleCloseParen();
385 if (fParenStack.size() > 0) {
386 // Missing close paren in pattern.
387 error(U_REGEX_MISMATCHED_PAREN);
388 }
389
390 // add the END operation to the compiled pattern.
391 appendOp(URX_END, 0);
392
393 // Terminate the pattern compilation state machine.
394 returnVal = FALSE;
395 break;
396
397
398
399 case doOrOperator:
400 // Scanning a '|', as in (A|B)
401 {
402 // Generate code for any pending literals preceding the '|'
403 fixLiterals(FALSE);
404
405 // Insert a SAVE operation at the start of the pattern section preceding
406 // this OR at this level. This SAVE will branch the match forward
407 // to the right hand side of the OR in the event that the left hand
408 // side fails to match and backtracks. Locate the position for the
409 // save from the location on the top of the parentheses stack.
410 int32_t savePosition = fParenStack.popi();
411 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
412 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
413 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
414 fRXPat->fCompiledPat->setElementAt(op, savePosition);
415
416 // Append an JMP operation into the compiled pattern. The operand for
417 // the JMP will eventually be the location following the ')' for the
418 // group. This will be patched in later, when the ')' is encountered.
419 appendOp(URX_JMP, 0);
420
421 // Push the position of the newly added JMP op onto the parentheses stack.
422 // This registers if for fixup when this block's close paren is encountered.
423 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
424
425 // Append a NOP to the compiled pattern. This is the slot reserved
426 // for a SAVE in the event that there is yet another '|' following
427 // this one.
428 appendOp(URX_NOP, 0);
429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
430 }
431 break;
432
433
434 case doBeginNamedCapture:
435 // Scanning (?<letter.
436 // The first letter of the name will come through again under doConinueNamedCapture.
437 fCaptureName = new UnicodeString();
438 if (fCaptureName == NULL) {
439 error(U_MEMORY_ALLOCATION_ERROR);
440 }
441 break;
442
443 case doContinueNamedCapture:
444 fCaptureName->append(fC.fChar);
445 break;
446
447 case doBadNamedCapture:
448 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
449 break;
450
451 case doOpenCaptureParen:
452 // Open Capturing Paren, possibly named.
453 // Compile to a
454 // - NOP, which later may be replaced by a save-state if the
455 // parenthesized group gets a * quantifier, followed by
456 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
457 // - NOP, which may later be replaced by a save-state if there
458 // is an '|' alternation within the parens.
459 //
460 // Each capture group gets three slots in the save stack frame:
461 // 0: Capture Group start position (in input string being matched.)
462 // 1: Capture Group end position.
463 // 2: Start of Match-in-progress.
464 // The first two locations are for a completed capture group, and are
465 // referred to by back references and the like.
466 // The third location stores the capture start position when an START_CAPTURE is
467 // encountered. This will be promoted to a completed capture when (and if) the corresponding
468 // END_CAPTURE is encountered.
469 {
470 fixLiterals();
471 appendOp(URX_NOP, 0);
472 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
473 appendOp(URX_START_CAPTURE, varsLoc);
474 appendOp(URX_NOP, 0);
475
476 // On the Parentheses stack, start a new frame and add the postions
477 // of the two NOPs. Depending on what follows in the pattern, the
478 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
479 // address of the end of the parenthesized group.
480 fParenStack.push(fModeFlags, *fStatus); // Match mode state
481 fParenStack.push(capturing, *fStatus); // Frame type.
482 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
483 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
484
485 // Save the mapping from group number to stack frame variable position.
486 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
487
488 // If this is a named capture group, add the name->group number mapping.
489 if (fCaptureName != NULL) {
490 int32_t groupNumber = fRXPat->fGroupMap->size();
491 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
492 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
493 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
494 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
495 }
496 }
497 }
498 break;
499
500 case doOpenNonCaptureParen:
501 // Open non-caputuring (grouping only) Paren.
502 // Compile to a
503 // - NOP, which later may be replaced by a save-state if the
504 // parenthesized group gets a * quantifier, followed by
505 // - NOP, which may later be replaced by a save-state if there
506 // is an '|' alternation within the parens.
507 {
508 fixLiterals();
509 appendOp(URX_NOP, 0);
510 appendOp(URX_NOP, 0);
511
512 // On the Parentheses stack, start a new frame and add the postions
513 // of the two NOPs.
514 fParenStack.push(fModeFlags, *fStatus); // Match mode state
515 fParenStack.push(plain, *fStatus); // Begin a new frame.
516 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
517 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
518 }
519 break;
520
521
522 case doOpenAtomicParen:
523 // Open Atomic Paren. (?>
524 // Compile to a
525 // - NOP, which later may be replaced if the parenthesized group
526 // has a quantifier, followed by
527 // - STO_SP save state stack position, so it can be restored at the ")"
528 // - NOP, which may later be replaced by a save-state if there
529 // is an '|' alternation within the parens.
530 {
531 fixLiterals();
532 appendOp(URX_NOP, 0);
533 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
534 appendOp(URX_STO_SP, varLoc);
535 appendOp(URX_NOP, 0);
536
537 // On the Parentheses stack, start a new frame and add the postions
538 // of the two NOPs. Depending on what follows in the pattern, the
539 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
540 // address of the end of the parenthesized group.
541 fParenStack.push(fModeFlags, *fStatus); // Match mode state
542 fParenStack.push(atomic, *fStatus); // Frame type.
543 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
544 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
545 }
546 break;
547
548
549 case doOpenLookAhead:
550 // Positive Look-ahead (?= stuff )
551 //
552 // Note: Addition of transparent input regions, with the need to
553 // restore the original regions when failing out of a lookahead
554 // block, complicated this sequence. Some conbined opcodes
555 // might make sense - or might not, lookahead aren't that common.
556 //
557 // Caution: min match length optimization knows about this
558 // sequence; don't change without making updates there too.
559 //
560 // Compiles to
561 // 1 START_LA dataLoc Saves SP, Input Pos
562 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
563 // 3 JMP 6 continue ...
564 //
565 // 4. LA_END Look Ahead failed. Restore regions.
566 // 5. BACKTRACK and back track again.
567 //
568 // 6. NOP reserved for use by quantifiers on the block.
569 // Look-ahead can't have quantifiers, but paren stack
570 // compile time conventions require the slot anyhow.
571 // 7. NOP may be replaced if there is are '|' ops in the block.
572 // 8. code for parenthesized stuff.
573 // 9. LA_END
574 //
575 // Two data slots are reserved, for saving the stack ptr and the input position.
576 {
577 fixLiterals();
578 int32_t dataLoc = allocateData(2);
579 appendOp(URX_LA_START, dataLoc);
580 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
581 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
582 appendOp(URX_LA_END, dataLoc);
583 appendOp(URX_BACKTRACK, 0);
584 appendOp(URX_NOP, 0);
585 appendOp(URX_NOP, 0);
586
587 // On the Parentheses stack, start a new frame and add the postions
588 // of the NOPs.
589 fParenStack.push(fModeFlags, *fStatus); // Match mode state
590 fParenStack.push(lookAhead, *fStatus); // Frame type.
591 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
592 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
593 }
594 break;
595
596 case doOpenLookAheadNeg:
597 // Negated Lookahead. (?! stuff )
598 // Compiles to
599 // 1. START_LA dataloc
600 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
601 // // which continues with the match.
602 // 3. NOP // Std. Open Paren sequence, for possible '|'
603 // 4. code for parenthesized stuff.
604 // 5. END_LA // Cut back stack, remove saved state from step 2.
605 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
606 // 7. END_LA // Restore match region, in case look-ahead was using
607 // an alternate (transparent) region.
608 {
609 fixLiterals();
610 int32_t dataLoc = allocateData(2);
611 appendOp(URX_LA_START, dataLoc);
612 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
613 appendOp(URX_NOP, 0);
614
615 // On the Parentheses stack, start a new frame and add the postions
616 // of the StateSave and NOP.
617 fParenStack.push(fModeFlags, *fStatus); // Match mode state
618 fParenStack.push(negLookAhead, *fStatus); // Frame type
619 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
620 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
621
622 // Instructions #5 - #7 will be added when the ')' is encountered.
623 }
624 break;
625
626 case doOpenLookBehind:
627 {
628 // Compile a (?<= look-behind open paren.
629 //
630 // Compiles to
631 // 0 URX_LB_START dataLoc
632 // 1 URX_LB_CONT dataLoc
633 // 2 MinMatchLen
634 // 3 MaxMatchLen
635 // 4 URX_NOP Standard '(' boilerplate.
636 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
637 // 6 <code for LookBehind expression>
638 // 7 URX_LB_END dataLoc # Check match len, restore input len
639 // 8 URX_LA_END dataLoc # Restore stack, input pos
640 //
641 // Allocate a block of matcher data, to contain (when running a match)
642 // 0: Stack ptr on entry
643 // 1: Input Index on entry
644 // 2: Start index of match current match attempt.
645 // 3: Original Input String len.
646
647 // Generate match code for any pending literals.
648 fixLiterals();
649
650 // Allocate data space
651 int32_t dataLoc = allocateData(4);
652
653 // Emit URX_LB_START
654 appendOp(URX_LB_START, dataLoc);
655
656 // Emit URX_LB_CONT
657 appendOp(URX_LB_CONT, dataLoc);
658 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
659 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
660
661 // Emit the NOPs
662 appendOp(URX_NOP, 0);
663 appendOp(URX_NOP, 0);
664
665 // On the Parentheses stack, start a new frame and add the postions
666 // of the URX_LB_CONT and the NOP.
667 fParenStack.push(fModeFlags, *fStatus); // Match mode state
668 fParenStack.push(lookBehind, *fStatus); // Frame type
669 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
670 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
671
672 // The final two instructions will be added when the ')' is encountered.
673 }
674
675 break;
676
677 case doOpenLookBehindNeg:
678 {
679 // Compile a (?<! negated look-behind open paren.
680 //
681 // Compiles to
682 // 0 URX_LB_START dataLoc # Save entry stack, input len
683 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
684 // 2 MinMatchLen
685 // 3 MaxMatchLen
686 // 4 continueLoc (9)
687 // 5 URX_NOP Standard '(' boilerplate.
688 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
689 // 7 <code for LookBehind expression>
690 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
691 // 9 ...
692 //
693 // Allocate a block of matcher data, to contain (when running a match)
694 // 0: Stack ptr on entry
695 // 1: Input Index on entry
696 // 2: Start index of match current match attempt.
697 // 3: Original Input String len.
698
699 // Generate match code for any pending literals.
700 fixLiterals();
701
702 // Allocate data space
703 int32_t dataLoc = allocateData(4);
704
705 // Emit URX_LB_START
706 appendOp(URX_LB_START, dataLoc);
707
708 // Emit URX_LBN_CONT
709 appendOp(URX_LBN_CONT, dataLoc);
710 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
711 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
712 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
713
714 // Emit the NOPs
715 appendOp(URX_NOP, 0);
716 appendOp(URX_NOP, 0);
717
718 // On the Parentheses stack, start a new frame and add the postions
719 // of the URX_LB_CONT and the NOP.
720 fParenStack.push(fModeFlags, *fStatus); // Match mode state
721 fParenStack.push(lookBehindN, *fStatus); // Frame type
722 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
723 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
724
725 // The final two instructions will be added when the ')' is encountered.
726 }
727 break;
728
729 case doConditionalExpr:
730 // Conditionals such as (?(1)a:b)
731 case doPerlInline:
732 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
733 error(U_REGEX_UNIMPLEMENTED);
734 break;
735
736
737 case doCloseParen:
738 handleCloseParen();
739 if (fParenStack.size() <= 0) {
740 // Extra close paren, or missing open paren.
741 error(U_REGEX_MISMATCHED_PAREN);
742 }
743 break;
744
745 case doNOP:
746 break;
747
748
749 case doBadOpenParenType:
750 case doRuleError:
751 error(U_REGEX_RULE_SYNTAX);
752 break;
753
754
755 case doMismatchedParenErr:
756 error(U_REGEX_MISMATCHED_PAREN);
757 break;
758
759 case doPlus:
760 // Normal '+' compiles to
761 // 1. stuff to be repeated (already built)
762 // 2. jmp-sav 1
763 // 3. ...
764 //
765 // Or, if the item to be repeated can match a zero length string,
766 // 1. STO_INP_LOC data-loc
767 // 2. body of stuff to be repeated
768 // 3. JMP_SAV_X 2
769 // 4. ...
770
771 //
772 // Or, if the item to be repeated is simple
773 // 1. Item to be repeated.
774 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
775 // 3. LOOP_C stack location
776 {
777 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
778 int32_t frameLoc;
779
780 // Check for simple constructs, which may get special optimized code.
781 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
782 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
783
784 if (URX_TYPE(repeatedOp) == URX_SETREF) {
785 // Emit optimized code for [char set]+
786 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
787 frameLoc = allocateStackData(1);
788 appendOp(URX_LOOP_C, frameLoc);
789 break;
790 }
791
792 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
793 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
794 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
795 // Emit Optimized code for .+ operations.
796 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
797 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
798 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
799 loopOpI |= 1;
800 }
801 if (fModeFlags & UREGEX_UNIX_LINES) {
802 loopOpI |= 2;
803 }
804 appendOp(loopOpI);
805 frameLoc = allocateStackData(1);
806 appendOp(URX_LOOP_C, frameLoc);
807 break;
808 }
809
810 }
811
812 // General case.
813
814 // Check for minimum match length of zero, which requires
815 // extra loop-breaking code.
816 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
817 // Zero length match is possible.
818 // Emit the code sequence that can handle it.
819 insertOp(topLoc);
820 frameLoc = allocateStackData(1);
821
822 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
823 fRXPat->fCompiledPat->setElementAt(op, topLoc);
824
825 appendOp(URX_JMP_SAV_X, topLoc+1);
826 } else {
827 // Simpler code when the repeated body must match something non-empty
828 appendOp(URX_JMP_SAV, topLoc);
829 }
830 }
831 break;
832
833 case doNGPlus:
834 // Non-greedy '+?' compiles to
835 // 1. stuff to be repeated (already built)
836 // 2. state-save 1
837 // 3. ...
838 {
839 int32_t topLoc = blockTopLoc(FALSE);
840 appendOp(URX_STATE_SAVE, topLoc);
841 }
842 break;
843
844
845 case doOpt:
846 // Normal (greedy) ? quantifier.
847 // Compiles to
848 // 1. state save 3
849 // 2. body of optional block
850 // 3. ...
851 // Insert the state save into the compiled pattern, and we're done.
852 {
853 int32_t saveStateLoc = blockTopLoc(TRUE);
854 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
855 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
856 }
857 break;
858
859 case doNGOpt:
860 // Non-greedy ?? quantifier
861 // compiles to
862 // 1. jmp 4
863 // 2. body of optional block
864 // 3 jmp 5
865 // 4. state save 2
866 // 5 ...
867 // This code is less than ideal, with two jmps instead of one, because we can only
868 // insert one instruction at the top of the block being iterated.
869 {
870 int32_t jmp1_loc = blockTopLoc(TRUE);
871 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
872
873 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
874 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
875
876 appendOp(URX_JMP, jmp2_loc+2);
877
878 appendOp(URX_STATE_SAVE, jmp1_loc+1);
879 }
880 break;
881
882
883 case doStar:
884 // Normal (greedy) * quantifier.
885 // Compiles to
886 // 1. STATE_SAVE 4
887 // 2. body of stuff being iterated over
888 // 3. JMP_SAV 2
889 // 4. ...
890 //
891 // Or, if the body is a simple [Set],
892 // 1. LOOP_SR_I set number
893 // 2. LOOP_C stack location
894 // ...
895 //
896 // Or if this is a .*
897 // 1. LOOP_DOT_I (. matches all mode flag)
898 // 2. LOOP_C stack location
899 //
900 // Or, if the body can match a zero-length string, to inhibit infinite loops,
901 // 1. STATE_SAVE 5
902 // 2. STO_INP_LOC data-loc
903 // 3. body of stuff
904 // 4. JMP_SAV_X 2
905 // 5. ...
906 {
907 // location of item #1, the STATE_SAVE
908 int32_t topLoc = blockTopLoc(FALSE);
909 int32_t dataLoc = -1;
910
911 // Check for simple *, where the construct being repeated
912 // compiled to single opcode, and might be optimizable.
913 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
914 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
915
916 if (URX_TYPE(repeatedOp) == URX_SETREF) {
917 // Emit optimized code for a [char set]*
918 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
919 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
920 dataLoc = allocateStackData(1);
921 appendOp(URX_LOOP_C, dataLoc);
922 break;
923 }
924
925 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
926 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
927 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
928 // Emit Optimized code for .* operations.
929 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
930 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
931 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
932 loopOpI |= 1;
933 }
934 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
935 loopOpI |= 2;
936 }
937 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
938 dataLoc = allocateStackData(1);
939 appendOp(URX_LOOP_C, dataLoc);
940 break;
941 }
942 }
943
944 // Emit general case code for this *
945 // The optimizations did not apply.
946
947 int32_t saveStateLoc = blockTopLoc(TRUE);
948 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
949
950 // Check for minimum match length of zero, which requires
951 // extra loop-breaking code.
952 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
953 insertOp(saveStateLoc);
954 dataLoc = allocateStackData(1);
955
956 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
957 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
958 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
959 }
960
961 // Locate the position in the compiled pattern where the match will continue
962 // after completing the *. (4 or 5 in the comment above)
963 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
964
965 // Put together the save state op and store it into the compiled code.
966 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
967 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
968
969 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
970 appendOp(jmpOp);
971 }
972 break;
973
974 case doNGStar:
975 // Non-greedy *? quantifier
976 // compiles to
977 // 1. JMP 3
978 // 2. body of stuff being iterated over
979 // 3. STATE_SAVE 2
980 // 4 ...
981 {
982 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
983 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
984 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
985 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
986 appendOp(URX_STATE_SAVE, jmpLoc+1);
987 }
988 break;
989
990
991 case doIntervalInit:
992 // The '{' opening an interval quantifier was just scanned.
993 // Init the counter varaiables that will accumulate the values as the digits
994 // are scanned.
995 fIntervalLow = 0;
996 fIntervalUpper = -1;
997 break;
998
999 case doIntevalLowerDigit:
1000 // Scanned a digit from the lower value of an {lower,upper} interval
1001 {
1002 int32_t digitValue = u_charDigitValue(fC.fChar);
1003 U_ASSERT(digitValue >= 0);
1004 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1005 if (val > INT32_MAX) {
1006 error(U_REGEX_NUMBER_TOO_BIG);
1007 } else {
1008 fIntervalLow = (int32_t)val;
1009 }
1010 }
1011 break;
1012
1013 case doIntervalUpperDigit:
1014 // Scanned a digit from the upper value of an {lower,upper} interval
1015 {
1016 if (fIntervalUpper < 0) {
1017 fIntervalUpper = 0;
1018 }
1019 int32_t digitValue = u_charDigitValue(fC.fChar);
1020 U_ASSERT(digitValue >= 0);
1021 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1022 if (val > INT32_MAX) {
1023 error(U_REGEX_NUMBER_TOO_BIG);
1024 } else {
1025 fIntervalUpper = (int32_t)val;
1026 }
1027 }
1028 break;
1029
1030 case doIntervalSame:
1031 // Scanned a single value interval like {27}. Upper = Lower.
1032 fIntervalUpper = fIntervalLow;
1033 break;
1034
1035 case doInterval:
1036 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1037 if (compileInlineInterval() == FALSE) {
1038 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1039 }
1040 break;
1041
1042 case doPossessiveInterval:
1043 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1044 {
1045 // Remember the loc for the top of the block being looped over.
1046 // (Can not reserve a slot in the compiled pattern at this time, because
1047 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1048 // once per block.)
1049 int32_t topLoc = blockTopLoc(FALSE);
1050
1051 // Produce normal looping code.
1052 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053
1054 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1055 // just as if the loop was inclosed in atomic parentheses.
1056
1057 // First the STO_SP before the start of the loop
1058 insertOp(topLoc);
1059
1060 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1061 int32_t op = buildOp(URX_STO_SP, varLoc);
1062 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1063
1064 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1065 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1066 loopOp++; // point LoopOp after the just-inserted STO_SP
1067 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1068
1069 // Then the LD_SP after the end of the loop
1070 appendOp(URX_LD_SP, varLoc);
1071 }
1072
1073 break;
1074
1075 case doNGInterval:
1076 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1077 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1078 break;
1079
1080 case doIntervalError:
1081 error(U_REGEX_BAD_INTERVAL);
1082 break;
1083
1084 case doLiteralChar:
1085 // We've just scanned a "normal" character from the pattern,
1086 literalChar(fC.fChar);
1087 break;
1088
1089
1090 case doEscapedLiteralChar:
1091 // We've just scanned an backslashed escaped character with no
1092 // special meaning. It represents itself.
1093 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1094 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1095 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1096 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1097 }
1098 literalChar(fC.fChar);
1099 break;
1100
1101
1102 case doDotAny:
1103 // scanned a ".", match any single character.
1104 {
1105 fixLiterals(FALSE);
1106 if (fModeFlags & UREGEX_DOTALL) {
1107 appendOp(URX_DOTANY_ALL, 0);
1108 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1109 appendOp(URX_DOTANY_UNIX, 0);
1110 } else {
1111 appendOp(URX_DOTANY, 0);
1112 }
1113 }
1114 break;
1115
1116 case doCaret:
1117 {
1118 fixLiterals(FALSE);
1119 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1120 appendOp(URX_CARET, 0);
1121 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1122 appendOp(URX_CARET_M, 0);
1123 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1124 appendOp(URX_CARET, 0); // Only testing true start of input.
1125 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1126 appendOp(URX_CARET_M_UNIX, 0);
1127 }
1128 }
1129 break;
1130
1131 case doDollar:
1132 {
1133 fixLiterals(FALSE);
1134 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1135 appendOp(URX_DOLLAR, 0);
1136 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1137 appendOp(URX_DOLLAR_M, 0);
1138 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1139 appendOp(URX_DOLLAR_D, 0);
1140 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1141 appendOp(URX_DOLLAR_MD, 0);
1142 }
1143 }
1144 break;
1145
1146 case doBackslashA:
1147 fixLiterals(FALSE);
1148 appendOp(URX_CARET, 0);
1149 break;
1150
1151 case doBackslashB:
1152 {
1153 #if UCONFIG_NO_BREAK_ITERATION==1
1154 if (fModeFlags & UREGEX_UWORD) {
1155 error(U_UNSUPPORTED_ERROR);
1156 }
1157 #endif
1158 fixLiterals(FALSE);
1159 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1160 appendOp(op, 1);
1161 }
1162 break;
1163
1164 case doBackslashb:
1165 {
1166 #if UCONFIG_NO_BREAK_ITERATION==1
1167 if (fModeFlags & UREGEX_UWORD) {
1168 error(U_UNSUPPORTED_ERROR);
1169 }
1170 #endif
1171 fixLiterals(FALSE);
1172 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1173 appendOp(op, 0);
1174 }
1175 break;
1176
1177 case doBackslashD:
1178 fixLiterals(FALSE);
1179 appendOp(URX_BACKSLASH_D, 1);
1180 break;
1181
1182 case doBackslashd:
1183 fixLiterals(FALSE);
1184 appendOp(URX_BACKSLASH_D, 0);
1185 break;
1186
1187 case doBackslashG:
1188 fixLiterals(FALSE);
1189 appendOp(URX_BACKSLASH_G, 0);
1190 break;
1191
1192 case doBackslashH:
1193 fixLiterals(FALSE);
1194 appendOp(URX_BACKSLASH_H, 1);
1195 break;
1196
1197 case doBackslashh:
1198 fixLiterals(FALSE);
1199 appendOp(URX_BACKSLASH_H, 0);
1200 break;
1201
1202 case doBackslashR:
1203 fixLiterals(FALSE);
1204 appendOp(URX_BACKSLASH_R, 0);
1205 break;
1206
1207 case doBackslashS:
1208 fixLiterals(FALSE);
1209 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1210 break;
1211
1212 case doBackslashs:
1213 fixLiterals(FALSE);
1214 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1215 break;
1216
1217 case doBackslashV:
1218 fixLiterals(FALSE);
1219 appendOp(URX_BACKSLASH_V, 1);
1220 break;
1221
1222 case doBackslashv:
1223 fixLiterals(FALSE);
1224 appendOp(URX_BACKSLASH_V, 0);
1225 break;
1226
1227 case doBackslashW:
1228 fixLiterals(FALSE);
1229 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1230 break;
1231
1232 case doBackslashw:
1233 fixLiterals(FALSE);
1234 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1235 break;
1236
1237 case doBackslashX:
1238 fixLiterals(FALSE);
1239 appendOp(URX_BACKSLASH_X, 0);
1240 break;
1241
1242
1243 case doBackslashZ:
1244 fixLiterals(FALSE);
1245 appendOp(URX_DOLLAR, 0);
1246 break;
1247
1248 case doBackslashz:
1249 fixLiterals(FALSE);
1250 appendOp(URX_BACKSLASH_Z, 0);
1251 break;
1252
1253 case doEscapeError:
1254 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1255 break;
1256
1257 case doExit:
1258 fixLiterals(FALSE);
1259 returnVal = FALSE;
1260 break;
1261
1262 case doProperty:
1263 {
1264 fixLiterals(FALSE);
1265 UnicodeSet *theSet = scanProp();
1266 compileSet(theSet);
1267 }
1268 break;
1269
1270 case doNamedChar:
1271 {
1272 UChar32 c = scanNamedChar();
1273 literalChar(c);
1274 }
1275 break;
1276
1277
1278 case doBackRef:
1279 // BackReference. Somewhat unusual in that the front-end can not completely parse
1280 // the regular expression, because the number of digits to be consumed
1281 // depends on the number of capture groups that have been defined. So
1282 // we have to do it here instead.
1283 {
1284 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1285 int32_t groupNum = 0;
1286 UChar32 c = fC.fChar;
1287
1288 for (;;) {
1289 // Loop once per digit, for max allowed number of digits in a back reference.
1290 int32_t digit = u_charDigitValue(c);
1291 groupNum = groupNum * 10 + digit;
1292 if (groupNum >= numCaptureGroups) {
1293 break;
1294 }
1295 c = peekCharLL();
1296 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1297 break;
1298 }
1299 nextCharLL();
1300 }
1301
1302 // Scan of the back reference in the source regexp is complete. Now generate
1303 // the compiled code for it.
1304 // Because capture groups can be forward-referenced by back-references,
1305 // we fill the operand with the capture group number. At the end
1306 // of compilation, it will be changed to the variable's location.
1307 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1308 // and shouldn't enter this code path at all.
1309 fixLiterals(FALSE);
1310 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1311 appendOp(URX_BACKREF_I, groupNum);
1312 } else {
1313 appendOp(URX_BACKREF, groupNum);
1314 }
1315 }
1316 break;
1317
1318 case doBeginNamedBackRef:
1319 U_ASSERT(fCaptureName == NULL);
1320 fCaptureName = new UnicodeString;
1321 if (fCaptureName == NULL) {
1322 error(U_MEMORY_ALLOCATION_ERROR);
1323 }
1324 break;
1325
1326 case doContinueNamedBackRef:
1327 fCaptureName->append(fC.fChar);
1328 break;
1329
1330 case doCompleteNamedBackRef:
1331 {
1332 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1333 if (groupNumber == 0) {
1334 // Group name has not been defined.
1335 // Could be a forward reference. If we choose to support them at some
1336 // future time, extra mechanism will be required at this point.
1337 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1338 } else {
1339 // Given the number, handle identically to a \n numbered back reference.
1340 // See comments above, under doBackRef
1341 fixLiterals(FALSE);
1342 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1343 appendOp(URX_BACKREF_I, groupNumber);
1344 } else {
1345 appendOp(URX_BACKREF, groupNumber);
1346 }
1347 }
1348 delete fCaptureName;
1349 fCaptureName = NULL;
1350 break;
1351 }
1352
1353 case doPossessivePlus:
1354 // Possessive ++ quantifier.
1355 // Compiles to
1356 // 1. STO_SP
1357 // 2. body of stuff being iterated over
1358 // 3. STATE_SAVE 5
1359 // 4. JMP 2
1360 // 5. LD_SP
1361 // 6. ...
1362 //
1363 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1364 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1365 //
1366 {
1367 // Emit the STO_SP
1368 int32_t topLoc = blockTopLoc(TRUE);
1369 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1370 int32_t op = buildOp(URX_STO_SP, stoLoc);
1371 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1372
1373 // Emit the STATE_SAVE
1374 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1375
1376 // Emit the JMP
1377 appendOp(URX_JMP, topLoc+1);
1378
1379 // Emit the LD_SP
1380 appendOp(URX_LD_SP, stoLoc);
1381 }
1382 break;
1383
1384 case doPossessiveStar:
1385 // Possessive *+ quantifier.
1386 // Compiles to
1387 // 1. STO_SP loc
1388 // 2. STATE_SAVE 5
1389 // 3. body of stuff being iterated over
1390 // 4. JMP 2
1391 // 5. LD_SP loc
1392 // 6 ...
1393 // TODO: do something to cut back the state stack each time through the loop.
1394 {
1395 // Reserve two slots at the top of the block.
1396 int32_t topLoc = blockTopLoc(TRUE);
1397 insertOp(topLoc);
1398
1399 // emit STO_SP loc
1400 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1401 int32_t op = buildOp(URX_STO_SP, stoLoc);
1402 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1403
1404 // Emit the SAVE_STATE 5
1405 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1406 op = buildOp(URX_STATE_SAVE, L7);
1407 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1408
1409 // Append the JMP operation.
1410 appendOp(URX_JMP, topLoc+1);
1411
1412 // Emit the LD_SP loc
1413 appendOp(URX_LD_SP, stoLoc);
1414 }
1415 break;
1416
1417 case doPossessiveOpt:
1418 // Possessive ?+ quantifier.
1419 // Compiles to
1420 // 1. STO_SP loc
1421 // 2. SAVE_STATE 5
1422 // 3. body of optional block
1423 // 4. LD_SP loc
1424 // 5. ...
1425 //
1426 {
1427 // Reserve two slots at the top of the block.
1428 int32_t topLoc = blockTopLoc(TRUE);
1429 insertOp(topLoc);
1430
1431 // Emit the STO_SP
1432 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1433 int32_t op = buildOp(URX_STO_SP, stoLoc);
1434 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1435
1436 // Emit the SAVE_STATE
1437 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1438 op = buildOp(URX_STATE_SAVE, continueLoc);
1439 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1440
1441 // Emit the LD_SP
1442 appendOp(URX_LD_SP, stoLoc);
1443 }
1444 break;
1445
1446
1447 case doBeginMatchMode:
1448 fNewModeFlags = fModeFlags;
1449 fSetModeFlag = TRUE;
1450 break;
1451
1452 case doMatchMode: // (?i) and similar
1453 {
1454 int32_t bit = 0;
1455 switch (fC.fChar) {
1456 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1457 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1458 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1459 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1460 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1461 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1462 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1463 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1464 default:
1465 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
1466 // by the scanner.
1467 }
1468 if (fSetModeFlag) {
1469 fNewModeFlags |= bit;
1470 } else {
1471 fNewModeFlags &= ~bit;
1472 }
1473 }
1474 break;
1475
1476 case doSetMatchMode:
1477 // Emit code to match any pending literals, using the not-yet changed match mode.
1478 fixLiterals();
1479
1480 // We've got a (?i) or similar. The match mode is being changed, but
1481 // the change is not scoped to a parenthesized block.
1482 U_ASSERT(fNewModeFlags < 0);
1483 fModeFlags = fNewModeFlags;
1484
1485 break;
1486
1487
1488 case doMatchModeParen:
1489 // We've got a (?i: or similar. Begin a parenthesized block, save old
1490 // mode flags so they can be restored at the close of the block.
1491 //
1492 // Compile to a
1493 // - NOP, which later may be replaced by a save-state if the
1494 // parenthesized group gets a * quantifier, followed by
1495 // - NOP, which may later be replaced by a save-state if there
1496 // is an '|' alternation within the parens.
1497 {
1498 fixLiterals(FALSE);
1499 appendOp(URX_NOP, 0);
1500 appendOp(URX_NOP, 0);
1501
1502 // On the Parentheses stack, start a new frame and add the postions
1503 // of the two NOPs (a normal non-capturing () frame, except for the
1504 // saving of the orignal mode flags.)
1505 fParenStack.push(fModeFlags, *fStatus);
1506 fParenStack.push(flags, *fStatus); // Frame Marker
1507 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1508 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1509
1510 // Set the current mode flags to the new values.
1511 U_ASSERT(fNewModeFlags < 0);
1512 fModeFlags = fNewModeFlags;
1513 }
1514 break;
1515
1516 case doBadModeFlag:
1517 error(U_REGEX_INVALID_FLAG);
1518 break;
1519
1520 case doSuppressComments:
1521 // We have just scanned a '(?'. We now need to prevent the character scanner from
1522 // treating a '#' as a to-the-end-of-line comment.
1523 // (This Perl compatibility just gets uglier and uglier to do...)
1524 fEOLComments = FALSE;
1525 break;
1526
1527
1528 case doSetAddAmp:
1529 {
1530 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1531 set->add(chAmp);
1532 }
1533 break;
1534
1535 case doSetAddDash:
1536 {
1537 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1538 set->add(chDash);
1539 }
1540 break;
1541
1542 case doSetBackslash_s:
1543 {
1544 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1545 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1546 break;
1547 }
1548
1549 case doSetBackslash_S:
1550 {
1551 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1552 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1553 SSet.complement();
1554 set->addAll(SSet);
1555 break;
1556 }
1557
1558 case doSetBackslash_d:
1559 {
1560 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561 // TODO - make a static set, ticket 6058.
1562 addCategory(set, U_GC_ND_MASK, *fStatus);
1563 break;
1564 }
1565
1566 case doSetBackslash_D:
1567 {
1568 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1569 UnicodeSet digits;
1570 // TODO - make a static set, ticket 6058.
1571 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1572 digits.complement();
1573 set->addAll(digits);
1574 break;
1575 }
1576
1577 case doSetBackslash_h:
1578 {
1579 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1580 UnicodeSet h;
1581 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1582 h.add((UChar32)9); // Tab
1583 set->addAll(h);
1584 break;
1585 }
1586
1587 case doSetBackslash_H:
1588 {
1589 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1590 UnicodeSet h;
1591 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1592 h.add((UChar32)9); // Tab
1593 h.complement();
1594 set->addAll(h);
1595 break;
1596 }
1597
1598 case doSetBackslash_v:
1599 {
1600 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1601 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1602 set->add((UChar32)0x85);
1603 set->add((UChar32)0x2028, (UChar32)0x2029);
1604 break;
1605 }
1606
1607 case doSetBackslash_V:
1608 {
1609 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1610 UnicodeSet v;
1611 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1612 v.add((UChar32)0x85);
1613 v.add((UChar32)0x2028, (UChar32)0x2029);
1614 v.complement();
1615 set->addAll(v);
1616 break;
1617 }
1618
1619 case doSetBackslash_w:
1620 {
1621 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1622 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1623 break;
1624 }
1625
1626 case doSetBackslash_W:
1627 {
1628 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1629 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1630 SSet.complement();
1631 set->addAll(SSet);
1632 break;
1633 }
1634
1635 case doSetBegin:
1636 fixLiterals(FALSE);
1637 fSetStack.push(new UnicodeSet(), *fStatus);
1638 fSetOpStack.push(setStart, *fStatus);
1639 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1640 fSetOpStack.push(setCaseClose, *fStatus);
1641 }
1642 break;
1643
1644 case doSetBeginDifference1:
1645 // We have scanned something like [[abc]-[
1646 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1647 // Push a Difference operator, which will cause the new set to be subtracted from what
1648 // went before once it is created.
1649 setPushOp(setDifference1);
1650 fSetOpStack.push(setStart, *fStatus);
1651 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1652 fSetOpStack.push(setCaseClose, *fStatus);
1653 }
1654 break;
1655
1656 case doSetBeginIntersection1:
1657 // We have scanned something like [[abc]&[
1658 // Need both the '&' operator and the open '[' operator.
1659 setPushOp(setIntersection1);
1660 fSetOpStack.push(setStart, *fStatus);
1661 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1662 fSetOpStack.push(setCaseClose, *fStatus);
1663 }
1664 break;
1665
1666 case doSetBeginUnion:
1667 // We have scanned something like [[abc][
1668 // Need to handle the union operation explicitly [[abc] | [
1669 setPushOp(setUnion);
1670 fSetOpStack.push(setStart, *fStatus);
1671 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1672 fSetOpStack.push(setCaseClose, *fStatus);
1673 }
1674 break;
1675
1676 case doSetDifference2:
1677 // We have scanned something like [abc--
1678 // Consider this to unambiguously be a set difference operator.
1679 setPushOp(setDifference2);
1680 break;
1681
1682 case doSetEnd:
1683 // Have encountered the ']' that closes a set.
1684 // Force the evaluation of any pending operations within this set,
1685 // leave the completed set on the top of the set stack.
1686 setEval(setEnd);
1687 U_ASSERT(fSetOpStack.peeki()==setStart);
1688 fSetOpStack.popi();
1689 break;
1690
1691 case doSetFinish:
1692 {
1693 // Finished a complete set expression, including all nested sets.
1694 // The close bracket has already triggered clearing out pending set operators,
1695 // the operator stack should be empty and the operand stack should have just
1696 // one entry, the result set.
1697 U_ASSERT(fSetOpStack.empty());
1698 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1699 U_ASSERT(fSetStack.empty());
1700 compileSet(theSet);
1701 break;
1702 }
1703
1704 case doSetIntersection2:
1705 // Have scanned something like [abc&&
1706 setPushOp(setIntersection2);
1707 break;
1708
1709 case doSetLiteral:
1710 // Union the just-scanned literal character into the set being built.
1711 // This operation is the highest precedence set operation, so we can always do
1712 // it immediately, without waiting to see what follows. It is necessary to perform
1713 // any pending '-' or '&' operation first, because these have the same precedence
1714 // as union-ing in a literal'
1715 {
1716 setEval(setUnion);
1717 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1718 s->add(fC.fChar);
1719 fLastSetLiteral = fC.fChar;
1720 break;
1721 }
1722
1723 case doSetLiteralEscaped:
1724 // A back-slash escaped literal character was encountered.
1725 // Processing is the same as with setLiteral, above, with the addition of
1726 // the optional check for errors on escaped ASCII letters.
1727 {
1728 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1729 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1730 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1731 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1732 }
1733 setEval(setUnion);
1734 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1735 s->add(fC.fChar);
1736 fLastSetLiteral = fC.fChar;
1737 break;
1738 }
1739
1740 case doSetNamedChar:
1741 // Scanning a \N{UNICODE CHARACTER NAME}
1742 // Aside from the source of the character, the processing is identical to doSetLiteral,
1743 // above.
1744 {
1745 UChar32 c = scanNamedChar();
1746 setEval(setUnion);
1747 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1748 s->add(c);
1749 fLastSetLiteral = c;
1750 break;
1751 }
1752
1753 case doSetNamedRange:
1754 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1755 // The left character is already in the set, and is saved in fLastSetLiteral.
1756 // The right side needs to be picked up, the scan is at the 'N'.
1757 // Lower Limit > Upper limit being an error matches both Java
1758 // and ICU UnicodeSet behavior.
1759 {
1760 UChar32 c = scanNamedChar();
1761 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1762 error(U_REGEX_INVALID_RANGE);
1763 }
1764 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1765 s->add(fLastSetLiteral, c);
1766 fLastSetLiteral = c;
1767 break;
1768 }
1769
1770
1771 case doSetNegate:
1772 // Scanned a '^' at the start of a set.
1773 // Push the negation operator onto the set op stack.
1774 // A twist for case-insensitive matching:
1775 // the case closure operation must happen _before_ negation.
1776 // But the case closure operation will already be on the stack if it's required.
1777 // This requires checking for case closure, and swapping the stack order
1778 // if it is present.
1779 {
1780 int32_t tosOp = fSetOpStack.peeki();
1781 if (tosOp == setCaseClose) {
1782 fSetOpStack.popi();
1783 fSetOpStack.push(setNegation, *fStatus);
1784 fSetOpStack.push(setCaseClose, *fStatus);
1785 } else {
1786 fSetOpStack.push(setNegation, *fStatus);
1787 }
1788 }
1789 break;
1790
1791 case doSetNoCloseError:
1792 error(U_REGEX_MISSING_CLOSE_BRACKET);
1793 break;
1794
1795 case doSetOpError:
1796 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1797 break;
1798
1799 case doSetPosixProp:
1800 {
1801 UnicodeSet *s = scanPosixProp();
1802 if (s != NULL) {
1803 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1804 tos->addAll(*s);
1805 delete s;
1806 } // else error. scanProp() reported the error status already.
1807 }
1808 break;
1809
1810 case doSetProp:
1811 // Scanned a \p \P within [brackets].
1812 {
1813 UnicodeSet *s = scanProp();
1814 if (s != NULL) {
1815 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1816 tos->addAll(*s);
1817 delete s;
1818 } // else error. scanProp() reported the error status already.
1819 }
1820 break;
1821
1822
1823 case doSetRange:
1824 // We have scanned literal-literal. Add the range to the set.
1825 // The left character is already in the set, and is saved in fLastSetLiteral.
1826 // The right side is the current character.
1827 // Lower Limit > Upper limit being an error matches both Java
1828 // and ICU UnicodeSet behavior.
1829 {
1830
1831 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1832 error(U_REGEX_INVALID_RANGE);
1833 }
1834 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1835 s->add(fLastSetLiteral, fC.fChar);
1836 break;
1837 }
1838
1839 default:
1840 U_ASSERT(FALSE);
1841 error(U_REGEX_INTERNAL_ERROR);
1842 break;
1843 }
1844
1845 if (U_FAILURE(*fStatus)) {
1846 returnVal = FALSE;
1847 }
1848
1849 return returnVal;
1850 }
1851
1852
1853
1854 //------------------------------------------------------------------------------
1855 //
1856 // literalChar We've encountered a literal character from the pattern,
1857 // or an escape sequence that reduces to a character.
1858 // Add it to the string containing all literal chars/strings from
1859 // the pattern.
1860 //
1861 //------------------------------------------------------------------------------
1862 void RegexCompile::literalChar(UChar32 c) {
1863 fLiteralChars.append(c);
1864 }
1865
1866
1867 //------------------------------------------------------------------------------
1868 //
1869 // fixLiterals When compiling something that can follow a literal
1870 // string in a pattern, emit the code to match the
1871 // accumulated literal string.
1872 //
1873 // Optionally, split the last char of the string off into
1874 // a single "ONE_CHAR" operation, so that quantifiers can
1875 // apply to that char alone. Example: abc*
1876 // The * must apply to the 'c' only.
1877 //
1878 //------------------------------------------------------------------------------
1879 void RegexCompile::fixLiterals(UBool split) {
1880
1881 // If no literal characters have been scanned but not yet had code generated
1882 // for them, nothing needs to be done.
1883 if (fLiteralChars.length() == 0) {
1884 return;
1885 }
1886
1887 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1888 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1889
1890 // Split: We need to ensure that the last item in the compiled pattern
1891 // refers only to the last literal scanned in the pattern, so that
1892 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1893 // Split before case folding for case insensitive matches.
1894
1895 if (split) {
1896 fLiteralChars.truncate(indexOfLastCodePoint);
1897 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1898 // Note that the truncated literal string may be empty, in which case
1899 // nothing will be emitted.
1900
1901 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1902 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1903 return;
1904 }
1905
1906 // If we are doing case-insensitive matching, case fold the string. This may expand
1907 // the string, e.g. the German sharp-s turns into "ss"
1908 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1909 fLiteralChars.foldCase();
1910 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1911 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1912 }
1913
1914 if (indexOfLastCodePoint == 0) {
1915 // Single character, emit a URX_ONECHAR op to match it.
1916 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1917 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1918 appendOp(URX_ONECHAR_I, lastCodePoint);
1919 } else {
1920 appendOp(URX_ONECHAR, lastCodePoint);
1921 }
1922 } else {
1923 // Two or more chars, emit a URX_STRING to match them.
1924 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1925 error(U_REGEX_PATTERN_TOO_BIG);
1926 }
1927 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1929 } else {
1930 // TODO here: add optimization to split case sensitive strings of length two
1931 // into two single char ops, for efficiency.
1932 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1933 }
1934 appendOp(URX_STRING_LEN, fLiteralChars.length());
1935
1936 // Add this string into the accumulated strings of the compiled pattern.
1937 fRXPat->fLiteralText.append(fLiteralChars);
1938 }
1939
1940 fLiteralChars.remove();
1941 }
1942
1943
1944 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1945 if (U_FAILURE(*fStatus)) {
1946 return 0;
1947 }
1948 if (type < 0 || type > 255) {
1949 U_ASSERT(FALSE);
1950 error(U_REGEX_INTERNAL_ERROR);
1951 type = URX_RESERVED_OP;
1952 }
1953 if (val > 0x00ffffff) {
1954 U_ASSERT(FALSE);
1955 error(U_REGEX_INTERNAL_ERROR);
1956 val = 0;
1957 }
1958 if (val < 0) {
1959 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1960 U_ASSERT(FALSE);
1961 error(U_REGEX_INTERNAL_ERROR);
1962 return -1;
1963 }
1964 if (URX_TYPE(val) != 0xff) {
1965 U_ASSERT(FALSE);
1966 error(U_REGEX_INTERNAL_ERROR);
1967 return -1;
1968 }
1969 type = URX_RESERVED_OP_N;
1970 }
1971 return (type << 24) | val;
1972 }
1973
1974
1975 //------------------------------------------------------------------------------
1976 //
1977 // appendOp() Append a new instruction onto the compiled pattern
1978 // Includes error checking, limiting the size of the
1979 // pattern to lengths that can be represented in the
1980 // 24 bit operand field of an instruction.
1981 //
1982 //------------------------------------------------------------------------------
1983 void RegexCompile::appendOp(int32_t op) {
1984 if (U_FAILURE(*fStatus)) {
1985 return;
1986 }
1987 fRXPat->fCompiledPat->addElement(op, *fStatus);
1988 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1989 error(U_REGEX_PATTERN_TOO_BIG);
1990 }
1991 }
1992
1993 void RegexCompile::appendOp(int32_t type, int32_t val) {
1994 appendOp(buildOp(type, val));
1995 }
1996
1997
1998 //------------------------------------------------------------------------------
1999 //
2000 // insertOp() Insert a slot for a new opcode into the already
2001 // compiled pattern code.
2002 //
2003 // Fill the slot with a NOP. Our caller will replace it
2004 // with what they really wanted.
2005 //
2006 //------------------------------------------------------------------------------
2007 void RegexCompile::insertOp(int32_t where) {
2008 UVector64 *code = fRXPat->fCompiledPat;
2009 U_ASSERT(where>0 && where < code->size());
2010
2011 int32_t nop = buildOp(URX_NOP, 0);
2012 code->insertElementAt(nop, where, *fStatus);
2013
2014 // Walk through the pattern, looking for any ops with targets that
2015 // were moved down by the insert. Fix them.
2016 int32_t loc;
2017 for (loc=0; loc<code->size(); loc++) {
2018 int32_t op = (int32_t)code->elementAti(loc);
2019 int32_t opType = URX_TYPE(op);
2020 int32_t opValue = URX_VAL(op);
2021 if ((opType == URX_JMP ||
2022 opType == URX_JMPX ||
2023 opType == URX_STATE_SAVE ||
2024 opType == URX_CTR_LOOP ||
2025 opType == URX_CTR_LOOP_NG ||
2026 opType == URX_JMP_SAV ||
2027 opType == URX_JMP_SAV_X ||
2028 opType == URX_RELOC_OPRND) && opValue > where) {
2029 // Target location for this opcode is after the insertion point and
2030 // needs to be incremented to adjust for the insertion.
2031 opValue++;
2032 op = buildOp(opType, opValue);
2033 code->setElementAt(op, loc);
2034 }
2035 }
2036
2037 // Now fix up the parentheses stack. All positive values in it are locations in
2038 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2039 for (loc=0; loc<fParenStack.size(); loc++) {
2040 int32_t x = fParenStack.elementAti(loc);
2041 U_ASSERT(x < code->size());
2042 if (x>where) {
2043 x++;
2044 fParenStack.setElementAt(x, loc);
2045 }
2046 }
2047
2048 if (fMatchCloseParen > where) {
2049 fMatchCloseParen++;
2050 }
2051 if (fMatchOpenParen > where) {
2052 fMatchOpenParen++;
2053 }
2054 }
2055
2056
2057 //------------------------------------------------------------------------------
2058 //
2059 // allocateData() Allocate storage in the matcher's static data area.
2060 // Return the index for the newly allocated data.
2061 // The storage won't actually exist until we are running a match
2062 // operation, but the storage indexes are inserted into various
2063 // opcodes while compiling the pattern.
2064 //
2065 //------------------------------------------------------------------------------
2066 int32_t RegexCompile::allocateData(int32_t size) {
2067 if (U_FAILURE(*fStatus)) {
2068 return 0;
2069 }
2070 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2071 error(U_REGEX_INTERNAL_ERROR);
2072 return 0;
2073 }
2074 int32_t dataIndex = fRXPat->fDataSize;
2075 fRXPat->fDataSize += size;
2076 if (fRXPat->fDataSize >= 0x00fffff0) {
2077 error(U_REGEX_INTERNAL_ERROR);
2078 }
2079 return dataIndex;
2080 }
2081
2082
2083 //------------------------------------------------------------------------------
2084 //
2085 // allocateStackData() Allocate space in the back-tracking stack frame.
2086 // Return the index for the newly allocated data.
2087 // The frame indexes are inserted into various
2088 // opcodes while compiling the pattern, meaning that frame
2089 // size must be restricted to the size that will fit
2090 // as an operand (24 bits).
2091 //
2092 //------------------------------------------------------------------------------
2093 int32_t RegexCompile::allocateStackData(int32_t size) {
2094 if (U_FAILURE(*fStatus)) {
2095 return 0;
2096 }
2097 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2098 error(U_REGEX_INTERNAL_ERROR);
2099 return 0;
2100 }
2101 int32_t dataIndex = fRXPat->fFrameSize;
2102 fRXPat->fFrameSize += size;
2103 if (fRXPat->fFrameSize >= 0x00fffff0) {
2104 error(U_REGEX_PATTERN_TOO_BIG);
2105 }
2106 return dataIndex;
2107 }
2108
2109
2110 //------------------------------------------------------------------------------
2111 //
2112 // blockTopLoc() Find or create a location in the compiled pattern
2113 // at the start of the operation or block that has
2114 // just been compiled. Needed when a quantifier (* or
2115 // whatever) appears, and we need to add an operation
2116 // at the start of the thing being quantified.
2117 //
2118 // (Parenthesized Blocks) have a slot with a NOP that
2119 // is reserved for this purpose. .* or similar don't
2120 // and a slot needs to be added.
2121 //
2122 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2123 // at the returned location.
2124 // FALSE - just return the address,
2125 // do not reserve a location there.
2126 //
2127 //------------------------------------------------------------------------------
2128 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2129 int32_t theLoc;
2130 fixLiterals(TRUE); // Emit code for any pending literals.
2131 // If last item was a string, emit separate op for the its last char.
2132 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2133 {
2134 // The item just processed is a parenthesized block.
2135 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2136 U_ASSERT(theLoc > 0);
2137 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2138 }
2139 else {
2140 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2141 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2142 // We need to make space now.
2143 theLoc = fRXPat->fCompiledPat->size()-1;
2144 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2145 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2146 // Strings take two opcode, we want the position of the first one.
2147 // We can have a string at this point if a single character case-folded to two.
2148 theLoc--;
2149 }
2150 if (reserveLoc) {
2151 int32_t nop = buildOp(URX_NOP, 0);
2152 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2153 }
2154 }
2155 return theLoc;
2156 }
2157
2158
2159
2160 //------------------------------------------------------------------------------
2161 //
2162 // handleCloseParen When compiling a close paren, we need to go back
2163 // and fix up any JMP or SAVE operations within the
2164 // parenthesized block that need to target the end
2165 // of the block. The locations of these are kept on
2166 // the paretheses stack.
2167 //
2168 // This function is called both when encountering a
2169 // real ) and at the end of the pattern.
2170 //
2171 //------------------------------------------------------------------------------
2172 void RegexCompile::handleCloseParen() {
2173 int32_t patIdx;
2174 int32_t patOp;
2175 if (fParenStack.size() <= 0) {
2176 error(U_REGEX_MISMATCHED_PAREN);
2177 return;
2178 }
2179
2180 // Emit code for any pending literals.
2181 fixLiterals(FALSE);
2182
2183 // Fixup any operations within the just-closed parenthesized group
2184 // that need to reference the end of the (block).
2185 // (The first one popped from the stack is an unused slot for
2186 // alternation (OR) state save, but applying the fixup to it does no harm.)
2187 for (;;) {
2188 patIdx = fParenStack.popi();
2189 if (patIdx < 0) {
2190 // value < 0 flags the start of the frame on the paren stack.
2191 break;
2192 }
2193 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2194 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2195 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2196 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2197 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2198 fMatchOpenParen = patIdx;
2199 }
2200
2201 // At the close of any parenthesized block, restore the match mode flags to
2202 // the value they had at the open paren. Saved value is
2203 // at the top of the paren stack.
2204 fModeFlags = fParenStack.popi();
2205 U_ASSERT(fModeFlags < 0);
2206
2207 // DO any additional fixups, depending on the specific kind of
2208 // parentesized grouping this is
2209
2210 switch (patIdx) {
2211 case plain:
2212 case flags:
2213 // No additional fixups required.
2214 // (Grouping-only parentheses)
2215 break;
2216 case capturing:
2217 // Capturing Parentheses.
2218 // Insert a End Capture op into the pattern.
2219 // The frame offset of the variables for this cg is obtained from the
2220 // start capture op and put it into the end-capture op.
2221 {
2222 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2223 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2224
2225 int32_t frameVarLocation = URX_VAL(captureOp);
2226 appendOp(URX_END_CAPTURE, frameVarLocation);
2227 }
2228 break;
2229 case atomic:
2230 // Atomic Parenthesis.
2231 // Insert a LD_SP operation to restore the state stack to the position
2232 // it was when the atomic parens were entered.
2233 {
2234 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2235 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2236 int32_t stoLoc = URX_VAL(stoOp);
2237 appendOp(URX_LD_SP, stoLoc);
2238 }
2239 break;
2240
2241 case lookAhead:
2242 {
2243 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2244 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2245 int32_t dataLoc = URX_VAL(startOp);
2246 appendOp(URX_LA_END, dataLoc);
2247 }
2248 break;
2249
2250 case negLookAhead:
2251 {
2252 // See comment at doOpenLookAheadNeg
2253 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2254 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2255 int32_t dataLoc = URX_VAL(startOp);
2256 appendOp(URX_LA_END, dataLoc);
2257 appendOp(URX_BACKTRACK, 0);
2258 appendOp(URX_LA_END, dataLoc);
2259
2260 // Patch the URX_SAVE near the top of the block.
2261 // The destination of the SAVE is the final LA_END that was just added.
2262 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2263 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2264 int32_t dest = fRXPat->fCompiledPat->size()-1;
2265 saveOp = buildOp(URX_STATE_SAVE, dest);
2266 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2267 }
2268 break;
2269
2270 case lookBehind:
2271 {
2272 // See comment at doOpenLookBehind.
2273
2274 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2275 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2276 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2277 int32_t dataLoc = URX_VAL(startOp);
2278 appendOp(URX_LB_END, dataLoc);
2279 appendOp(URX_LA_END, dataLoc);
2280
2281 // Determine the min and max bounds for the length of the
2282 // string that the pattern can match.
2283 // An unbounded upper limit is an error.
2284 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2285 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2286 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2287 if (URX_TYPE(maxML) != 0) {
2288 error(U_REGEX_LOOK_BEHIND_LIMIT);
2289 break;
2290 }
2291 if (maxML == INT32_MAX) {
2292 error(U_REGEX_LOOK_BEHIND_LIMIT);
2293 break;
2294 }
2295 U_ASSERT(minML <= maxML);
2296
2297 // Insert the min and max match len bounds into the URX_LB_CONT op that
2298 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2299 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2300 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2301
2302 }
2303 break;
2304
2305
2306
2307 case lookBehindN:
2308 {
2309 // See comment at doOpenLookBehindNeg.
2310
2311 // Append the URX_LBN_END to the compiled pattern.
2312 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2313 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2314 int32_t dataLoc = URX_VAL(startOp);
2315 appendOp(URX_LBN_END, dataLoc);
2316
2317 // Determine the min and max bounds for the length of the
2318 // string that the pattern can match.
2319 // An unbounded upper limit is an error.
2320 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2321 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2322 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2323 if (URX_TYPE(maxML) != 0) {
2324 error(U_REGEX_LOOK_BEHIND_LIMIT);
2325 break;
2326 }
2327 if (maxML == INT32_MAX) {
2328 error(U_REGEX_LOOK_BEHIND_LIMIT);
2329 break;
2330 }
2331 U_ASSERT(minML <= maxML);
2332
2333 // Insert the min and max match len bounds into the URX_LB_CONT op that
2334 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2335 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2336 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2337
2338 // Insert the pattern location to continue at after a successful match
2339 // as the last operand of the URX_LBN_CONT
2340 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2341 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2342 }
2343 break;
2344
2345
2346
2347 default:
2348 U_ASSERT(FALSE);
2349 }
2350
2351 // remember the next location in the compiled pattern.
2352 // The compilation of Quantifiers will look at this to see whether its looping
2353 // over a parenthesized block or a single item
2354 fMatchCloseParen = fRXPat->fCompiledPat->size();
2355 }
2356
2357
2358
2359 //------------------------------------------------------------------------------
2360 //
2361 // compileSet Compile the pattern operations for a reference to a
2362 // UnicodeSet.
2363 //
2364 //------------------------------------------------------------------------------
2365 void RegexCompile::compileSet(UnicodeSet *theSet)
2366 {
2367 if (theSet == NULL) {
2368 return;
2369 }
2370 // Remove any strings from the set.
2371 // There shoudn't be any, but just in case.
2372 // (Case Closure can add them; if we had a simple case closure avaialble that
2373 // ignored strings, that would be better.)
2374 theSet->removeAllStrings();
2375 int32_t setSize = theSet->size();
2376
2377 switch (setSize) {
2378 case 0:
2379 {
2380 // Set of no elements. Always fails to match.
2381 appendOp(URX_BACKTRACK, 0);
2382 delete theSet;
2383 }
2384 break;
2385
2386 case 1:
2387 {
2388 // The set contains only a single code point. Put it into
2389 // the compiled pattern as a single char operation rather
2390 // than a set, and discard the set itself.
2391 literalChar(theSet->charAt(0));
2392 delete theSet;
2393 }
2394 break;
2395
2396 default:
2397 {
2398 // The set contains two or more chars. (the normal case)
2399 // Put it into the compiled pattern as a set.
2400 int32_t setNumber = fRXPat->fSets->size();
2401 fRXPat->fSets->addElement(theSet, *fStatus);
2402 appendOp(URX_SETREF, setNumber);
2403 }
2404 }
2405 }
2406
2407
2408 //------------------------------------------------------------------------------
2409 //
2410 // compileInterval Generate the code for a {min, max} style interval quantifier.
2411 // Except for the specific opcodes used, the code is the same
2412 // for all three types (greedy, non-greedy, possessive) of
2413 // intervals. The opcodes are supplied as parameters.
2414 // (There are two sets of opcodes - greedy & possessive use the
2415 // same ones, while non-greedy has it's own.)
2416 //
2417 // The code for interval loops has this form:
2418 // 0 CTR_INIT counter loc (in stack frame)
2419 // 1 5 patt address of CTR_LOOP at bottom of block
2420 // 2 min count
2421 // 3 max count (-1 for unbounded)
2422 // 4 ... block to be iterated over
2423 // 5 CTR_LOOP
2424 //
2425 // In
2426 //------------------------------------------------------------------------------
2427 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2428 {
2429 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2430 // four slots in the compiled code. Reserve them.
2431 int32_t topOfBlock = blockTopLoc(TRUE);
2432 insertOp(topOfBlock);
2433 insertOp(topOfBlock);
2434 insertOp(topOfBlock);
2435
2436 // The operands for the CTR_INIT opcode include the index in the matcher data
2437 // of the counter. Allocate it now. There are two data items
2438 // counterLoc --> Loop counter
2439 // +1 --> Input index (for breaking non-progressing loops)
2440 // (Only present if unbounded upper limit on loop)
2441 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2442 int32_t counterLoc = allocateStackData(dataSize);
2443
2444 int32_t op = buildOp(InitOp, counterLoc);
2445 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2446
2447 // The second operand of CTR_INIT is the location following the end of the loop.
2448 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2449 // compilation of something later on causes the code to grow and the target
2450 // position to move.
2451 int32_t loopEnd = fRXPat->fCompiledPat->size();
2452 op = buildOp(URX_RELOC_OPRND, loopEnd);
2453 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2454
2455 // Followed by the min and max counts.
2456 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2457 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2458
2459 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2460 // Goes at end of the block being looped over, so just append to the code so far.
2461 appendOp(LoopOp, topOfBlock);
2462
2463 if ((fIntervalLow & 0xff000000) != 0 ||
2464 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2465 error(U_REGEX_NUMBER_TOO_BIG);
2466 }
2467
2468 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2469 error(U_REGEX_MAX_LT_MIN);
2470 }
2471 }
2472
2473
2474
2475 UBool RegexCompile::compileInlineInterval() {
2476 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2477 // Too big to inline. Fail, which will cause looping code to be generated.
2478 // (Upper < Lower picks up unbounded upper and errors, both.)
2479 return FALSE;
2480 }
2481
2482 int32_t topOfBlock = blockTopLoc(FALSE);
2483 if (fIntervalUpper == 0) {
2484 // Pathological case. Attempt no matches, as if the block doesn't exist.
2485 // Discard the generated code for the block.
2486 // If the block included parens, discard the info pertaining to them as well.
2487 fRXPat->fCompiledPat->setSize(topOfBlock);
2488 if (fMatchOpenParen >= topOfBlock) {
2489 fMatchOpenParen = -1;
2490 }
2491 if (fMatchCloseParen >= topOfBlock) {
2492 fMatchCloseParen = -1;
2493 }
2494 return TRUE;
2495 }
2496
2497 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2498 // The thing being repeated is not a single op, but some
2499 // more complex block. Do it as a loop, not inlines.
2500 // Note that things "repeated" a max of once are handled as inline, because
2501 // the one copy of the code already generated is just fine.
2502 return FALSE;
2503 }
2504
2505 // Pick up the opcode that is to be repeated
2506 //
2507 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2508
2509 // Compute the pattern location where the inline sequence
2510 // will end, and set up the state save op that will be needed.
2511 //
2512 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2513 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2514 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2515 if (fIntervalLow == 0) {
2516 insertOp(topOfBlock);
2517 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2518 }
2519
2520
2521
2522 // Loop, emitting the op for the thing being repeated each time.
2523 // Loop starts at 1 because one instance of the op already exists in the pattern,
2524 // it was put there when it was originally encountered.
2525 int32_t i;
2526 for (i=1; i<fIntervalUpper; i++ ) {
2527 if (i >= fIntervalLow) {
2528 appendOp(saveOp);
2529 }
2530 appendOp(op);
2531 }
2532 return TRUE;
2533 }
2534
2535
2536
2537 //------------------------------------------------------------------------------
2538 //
2539 // caseInsensitiveStart given a single code point from a pattern string, determine the
2540 // set of characters that could potentially begin a case-insensitive
2541 // match of a string beginning with that character, using full Unicode
2542 // case insensitive matching.
2543 //
2544 // This is used in optimizing find().
2545 //
2546 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2547 // misses cases like this:
2548 // A string from the pattern begins with 'ss' (although all we know
2549 // in this context is that it begins with 's')
2550 // The pattern could match a string beginning with a German sharp-s
2551 //
2552 // To the ordinary case closure for a character c, we add all other
2553 // characters cx where the case closure of cx incudes a string form that begins
2554 // with the original character c.
2555 //
2556 // This function could be made smarter. The full pattern string is available
2557 // and it would be possible to verify that the extra characters being added
2558 // to the starting set fully match, rather than having just a first-char of the
2559 // folded form match.
2560 //
2561 //------------------------------------------------------------------------------
2562 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2563
2564 // Machine Generated below.
2565 // It may need updating with new versions of Unicode.
2566 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2567 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2568
2569 // Machine Generated Data. Do not hand edit.
2570 static const UChar32 RECaseFixCodePoints[] = {
2571 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2572 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2573 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2574 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2575 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2576
2577 static const int16_t RECaseFixStringOffsets[] = {
2578 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2579 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2580 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2581 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2582 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2583
2584 static const int16_t RECaseFixCounts[] = {
2585 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2586 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2587 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2588 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2589 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2590
2591 static const UChar RECaseFixData[] = {
2592 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2593 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2594 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2595 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2596 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2597 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2598 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2599 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2600 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2601 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2602 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2603
2604 // End of machine generated data.
2605
2606 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2607 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2608 starterChars->set(caseFoldedC, caseFoldedC);
2609
2610 int32_t i;
2611 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2612 // Simple linear search through the sorted list of interesting code points.
2613 }
2614
2615 if (RECaseFixCodePoints[i] == c) {
2616 int32_t dataIndex = RECaseFixStringOffsets[i];
2617 int32_t numCharsToAdd = RECaseFixCounts[i];
2618 UChar32 cpToAdd = 0;
2619 for (int32_t j=0; j<numCharsToAdd; j++) {
2620 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2621 starterChars->add(cpToAdd);
2622 }
2623 }
2624
2625 starterChars->closeOver(USET_CASE_INSENSITIVE);
2626 starterChars->removeAllStrings();
2627 } else {
2628 // Not a cased character. Just return it alone.
2629 starterChars->set(c, c);
2630 }
2631 }
2632
2633
2634
2635
2636 //------------------------------------------------------------------------------
2637 //
2638 // matchStartType Determine how a match can start.
2639 // Used to optimize find() operations.
2640 //
2641 // Operation is very similar to minMatchLength(). Walk the compiled
2642 // pattern, keeping an on-going minimum-match-length. For any
2643 // op where the min match coming in is zero, add that ops possible
2644 // starting matches to the possible starts for the overall pattern.
2645 //
2646 //------------------------------------------------------------------------------
2647 void RegexCompile::matchStartType() {
2648 if (U_FAILURE(*fStatus)) {
2649 return;
2650 }
2651
2652
2653 int32_t loc; // Location in the pattern of the current op being processed.
2654 int32_t op; // The op being processed
2655 int32_t opType; // The opcode type of the op
2656 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2657 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2658
2659 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2660 // could have advanced the position in a match.
2661 // (Maximum match length so far == 0)
2662
2663 // forwardedLength is a vector holding minimum-match-length values that
2664 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2665 // It must be one longer than the pattern being checked because some ops
2666 // will jmp to a end-of-block+1 location from within a block, and we must
2667 // count those when checking the block.
2668 int32_t end = fRXPat->fCompiledPat->size();
2669 UVector32 forwardedLength(end+1, *fStatus);
2670 forwardedLength.setSize(end+1);
2671 for (loc=3; loc<end; loc++) {
2672 forwardedLength.setElementAt(INT32_MAX, loc);
2673 }
2674
2675 for (loc = 3; loc<end; loc++) {
2676 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2677 opType = URX_TYPE(op);
2678
2679 // The loop is advancing linearly through the pattern.
2680 // If the op we are now at was the destination of a branch in the pattern,
2681 // and that path has a shorter minimum length than the current accumulated value,
2682 // replace the current accumulated value.
2683 if (forwardedLength.elementAti(loc) < currentLen) {
2684 currentLen = forwardedLength.elementAti(loc);
2685 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2686 }
2687
2688 switch (opType) {
2689 // Ops that don't change the total length matched
2690 case URX_RESERVED_OP:
2691 case URX_END:
2692 case URX_FAIL:
2693 case URX_STRING_LEN:
2694 case URX_NOP:
2695 case URX_START_CAPTURE:
2696 case URX_END_CAPTURE:
2697 case URX_BACKSLASH_B:
2698 case URX_BACKSLASH_BU:
2699 case URX_BACKSLASH_G:
2700 case URX_BACKSLASH_Z:
2701 case URX_DOLLAR:
2702 case URX_DOLLAR_M:
2703 case URX_DOLLAR_D:
2704 case URX_DOLLAR_MD:
2705 case URX_RELOC_OPRND:
2706 case URX_STO_INP_LOC:
2707 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2708 case URX_BACKREF_I:
2709
2710 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2711 case URX_LD_SP:
2712 break;
2713
2714 case URX_CARET:
2715 if (atStart) {
2716 fRXPat->fStartType = START_START;
2717 }
2718 break;
2719
2720 case URX_CARET_M:
2721 case URX_CARET_M_UNIX:
2722 if (atStart) {
2723 fRXPat->fStartType = START_LINE;
2724 }
2725 break;
2726
2727 case URX_ONECHAR:
2728 if (currentLen == 0) {
2729 // This character could appear at the start of a match.
2730 // Add it to the set of possible starting characters.
2731 fRXPat->fInitialChars->add(URX_VAL(op));
2732 numInitialStrings += 2;
2733 }
2734 currentLen++;
2735 atStart = FALSE;
2736 break;
2737
2738
2739 case URX_SETREF:
2740 if (currentLen == 0) {
2741 int32_t sn = URX_VAL(op);
2742 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2743 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2744 fRXPat->fInitialChars->addAll(*s);
2745 numInitialStrings += 2;
2746 }
2747 currentLen++;
2748 atStart = FALSE;
2749 break;
2750
2751 case URX_LOOP_SR_I:
2752 // [Set]*, like a SETREF, above, in what it can match,
2753 // but may not match at all, so currentLen is not incremented.
2754 if (currentLen == 0) {
2755 int32_t sn = URX_VAL(op);
2756 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2757 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2758 fRXPat->fInitialChars->addAll(*s);
2759 numInitialStrings += 2;
2760 }
2761 atStart = FALSE;
2762 break;
2763
2764 case URX_LOOP_DOT_I:
2765 if (currentLen == 0) {
2766 // .* at the start of a pattern.
2767 // Any character can begin the match.
2768 fRXPat->fInitialChars->clear();
2769 fRXPat->fInitialChars->complement();
2770 numInitialStrings += 2;
2771 }
2772 atStart = FALSE;
2773 break;
2774
2775
2776 case URX_STATIC_SETREF:
2777 if (currentLen == 0) {
2778 int32_t sn = URX_VAL(op);
2779 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2780 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2781 fRXPat->fInitialChars->addAll(*s);
2782 numInitialStrings += 2;
2783 }
2784 currentLen++;
2785 atStart = FALSE;
2786 break;
2787
2788
2789
2790 case URX_STAT_SETREF_N:
2791 if (currentLen == 0) {
2792 int32_t sn = URX_VAL(op);
2793 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2794 UnicodeSet sc(*s);
2795 sc.complement();
2796 fRXPat->fInitialChars->addAll(sc);
2797 numInitialStrings += 2;
2798 }
2799 currentLen++;
2800 atStart = FALSE;
2801 break;
2802
2803
2804
2805 case URX_BACKSLASH_D:
2806 // Digit Char
2807 if (currentLen == 0) {
2808 UnicodeSet s;
2809 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2810 if (URX_VAL(op) != 0) {
2811 s.complement();
2812 }
2813 fRXPat->fInitialChars->addAll(s);
2814 numInitialStrings += 2;
2815 }
2816 currentLen++;
2817 atStart = FALSE;
2818 break;
2819
2820
2821 case URX_BACKSLASH_H:
2822 // Horiz white space
2823 if (currentLen == 0) {
2824 UnicodeSet s;
2825 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2826 s.add((UChar32)9); // Tab
2827 if (URX_VAL(op) != 0) {
2828 s.complement();
2829 }
2830 fRXPat->fInitialChars->addAll(s);
2831 numInitialStrings += 2;
2832 }
2833 currentLen++;
2834 atStart = FALSE;
2835 break;
2836
2837
2838 case URX_BACKSLASH_R: // Any line ending sequence
2839 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2840 if (currentLen == 0) {
2841 UnicodeSet s;
2842 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2843 s.add((UChar32)0x85);
2844 s.add((UChar32)0x2028, (UChar32)0x2029);
2845 if (URX_VAL(op) != 0) {
2846 // Complement option applies to URX_BACKSLASH_V only.
2847 s.complement();
2848 }
2849 fRXPat->fInitialChars->addAll(s);
2850 numInitialStrings += 2;
2851 }
2852 currentLen++;
2853 atStart = FALSE;
2854 break;
2855
2856
2857
2858 case URX_ONECHAR_I:
2859 // Case Insensitive Single Character.
2860 if (currentLen == 0) {
2861 UChar32 c = URX_VAL(op);
2862 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2863 UnicodeSet starters(c, c);
2864 starters.closeOver(USET_CASE_INSENSITIVE);
2865 // findCaseInsensitiveStarters(c, &starters);
2866 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2867 // The expanded folding can't match the pattern.
2868 fRXPat->fInitialChars->addAll(starters);
2869 } else {
2870 // Char has no case variants. Just add it as-is to the
2871 // set of possible starting chars.
2872 fRXPat->fInitialChars->add(c);
2873 }
2874 numInitialStrings += 2;
2875 }
2876 currentLen++;
2877 atStart = FALSE;
2878 break;
2879
2880
2881 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2882 case URX_DOTANY_ALL: // . matches one or two.
2883 case URX_DOTANY:
2884 case URX_DOTANY_UNIX:
2885 if (currentLen == 0) {
2886 // These constructs are all bad news when they appear at the start
2887 // of a match. Any character can begin the match.
2888 fRXPat->fInitialChars->clear();
2889 fRXPat->fInitialChars->complement();
2890 numInitialStrings += 2;
2891 }
2892 currentLen++;
2893 atStart = FALSE;
2894 break;
2895
2896
2897 case URX_JMPX:
2898 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2899 U_FALLTHROUGH;
2900 case URX_JMP:
2901 {
2902 int32_t jmpDest = URX_VAL(op);
2903 if (jmpDest < loc) {
2904 // Loop of some kind. Can safely ignore, the worst that will happen
2905 // is that we understate the true minimum length
2906 currentLen = forwardedLength.elementAti(loc+1);
2907
2908 } else {
2909 // Forward jump. Propagate the current min length to the target loc of the jump.
2910 U_ASSERT(jmpDest <= end+1);
2911 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2912 forwardedLength.setElementAt(currentLen, jmpDest);
2913 }
2914 }
2915 }
2916 atStart = FALSE;
2917 break;
2918
2919 case URX_JMP_SAV:
2920 case URX_JMP_SAV_X:
2921 // Combo of state save to the next loc, + jmp backwards.
2922 // Net effect on min. length computation is nothing.
2923 atStart = FALSE;
2924 break;
2925
2926 case URX_BACKTRACK:
2927 // Fails are kind of like a branch, except that the min length was
2928 // propagated already, by the state save.
2929 currentLen = forwardedLength.elementAti(loc+1);
2930 atStart = FALSE;
2931 break;
2932
2933
2934 case URX_STATE_SAVE:
2935 {
2936 // State Save, for forward jumps, propagate the current minimum.
2937 // of the state save.
2938 int32_t jmpDest = URX_VAL(op);
2939 if (jmpDest > loc) {
2940 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2941 forwardedLength.setElementAt(currentLen, jmpDest);
2942 }
2943 }
2944 }
2945 atStart = FALSE;
2946 break;
2947
2948
2949
2950
2951 case URX_STRING:
2952 {
2953 loc++;
2954 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2955 int32_t stringLen = URX_VAL(stringLenOp);
2956 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2957 U_ASSERT(stringLenOp >= 2);
2958 if (currentLen == 0) {
2959 // Add the starting character of this string to the set of possible starting
2960 // characters for this pattern.
2961 int32_t stringStartIdx = URX_VAL(op);
2962 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2963 fRXPat->fInitialChars->add(c);
2964
2965 // Remember this string. After the entire pattern has been checked,
2966 // if nothing else is identified that can start a match, we'll use it.
2967 numInitialStrings++;
2968 fRXPat->fInitialStringIdx = stringStartIdx;
2969 fRXPat->fInitialStringLen = stringLen;
2970 }
2971
2972 currentLen += stringLen;
2973 atStart = FALSE;
2974 }
2975 break;
2976
2977 case URX_STRING_I:
2978 {
2979 // Case-insensitive string. Unlike exact-match strings, we won't
2980 // attempt a string search for possible match positions. But we
2981 // do update the set of possible starting characters.
2982 loc++;
2983 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2984 int32_t stringLen = URX_VAL(stringLenOp);
2985 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2986 U_ASSERT(stringLenOp >= 2);
2987 if (currentLen == 0) {
2988 // Add the starting character of this string to the set of possible starting
2989 // characters for this pattern.
2990 int32_t stringStartIdx = URX_VAL(op);
2991 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2992 UnicodeSet s;
2993 findCaseInsensitiveStarters(c, &s);
2994 fRXPat->fInitialChars->addAll(s);
2995 numInitialStrings += 2; // Matching on an initial string not possible.
2996 }
2997 currentLen += stringLen;
2998 atStart = FALSE;
2999 }
3000 break;
3001
3002 case URX_CTR_INIT:
3003 case URX_CTR_INIT_NG:
3004 {
3005 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3006 // so location must be updated accordingly.
3007 // Loop Init Ops.
3008 // If the min loop count == 0
3009 // move loc forwards to the end of the loop, skipping over the body.
3010 // If the min count is > 0,
3011 // continue normal processing of the body of the loop.
3012 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3013 loopEndLoc = URX_VAL(loopEndLoc);
3014 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3015 if (minLoopCount == 0) {
3016 // Min Loop Count of 0, treat like a forward branch and
3017 // move the current minimum length up to the target
3018 // (end of loop) location.
3019 U_ASSERT(loopEndLoc <= end+1);
3020 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3021 forwardedLength.setElementAt(currentLen, loopEndLoc);
3022 }
3023 }
3024 loc+=3; // Skips over operands of CTR_INIT
3025 }
3026 atStart = FALSE;
3027 break;
3028
3029
3030 case URX_CTR_LOOP:
3031 case URX_CTR_LOOP_NG:
3032 // Loop ops.
3033 // The jump is conditional, backwards only.
3034 atStart = FALSE;
3035 break;
3036
3037 case URX_LOOP_C:
3038 // More loop ops. These state-save to themselves.
3039 // don't change the minimum match
3040 atStart = FALSE;
3041 break;
3042
3043
3044 case URX_LA_START:
3045 case URX_LB_START:
3046 {
3047 // Look-around. Scan forward until the matching look-ahead end,
3048 // without processing the look-around block. This is overly pessimistic.
3049
3050 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3051 // lookahead contains two LA_END instructions, so count goes up by two
3052 // for each LA_START.
3053 int32_t depth = (opType == URX_LA_START? 2: 1);
3054 for (;;) {
3055 loc++;
3056 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3057 if (URX_TYPE(op) == URX_LA_START) {
3058 depth+=2;
3059 }
3060 if (URX_TYPE(op) == URX_LB_START) {
3061 depth++;
3062 }
3063 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3064 depth--;
3065 if (depth == 0) {
3066 break;
3067 }
3068 }
3069 if (URX_TYPE(op) == URX_STATE_SAVE) {
3070 // Need this because neg lookahead blocks will FAIL to outside
3071 // of the block.
3072 int32_t jmpDest = URX_VAL(op);
3073 if (jmpDest > loc) {
3074 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3075 forwardedLength.setElementAt(currentLen, jmpDest);
3076 }
3077 }
3078 }
3079 U_ASSERT(loc <= end);
3080 }
3081 }
3082 break;
3083
3084 case URX_LA_END:
3085 case URX_LB_CONT:
3086 case URX_LB_END:
3087 case URX_LBN_CONT:
3088 case URX_LBN_END:
3089 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
3090 // consumed by the scan in URX_LA_START and LB_START
3091
3092 break;
3093
3094 default:
3095 U_ASSERT(FALSE);
3096 }
3097
3098 }
3099
3100
3101 // We have finished walking through the ops. Check whether some forward jump
3102 // propagated a shorter length to location end+1.
3103 if (forwardedLength.elementAti(end+1) < currentLen) {
3104 currentLen = forwardedLength.elementAti(end+1);
3105 }
3106
3107
3108 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3109
3110
3111 // Sort out what we should check for when looking for candidate match start positions.
3112 // In order of preference,
3113 // 1. Start of input text buffer.
3114 // 2. A literal string.
3115 // 3. Start of line in multi-line mode.
3116 // 4. A single literal character.
3117 // 5. A character from a set of characters.
3118 //
3119 if (fRXPat->fStartType == START_START) {
3120 // Match only at the start of an input text string.
3121 // start type is already set. We're done.
3122 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3123 // Match beginning only with a literal string.
3124 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3125 U_ASSERT(fRXPat->fInitialChars->contains(c));
3126 fRXPat->fStartType = START_STRING;
3127 fRXPat->fInitialChar = c;
3128 } else if (fRXPat->fStartType == START_LINE) {
3129 // Match at start of line in Multi-Line mode.
3130 // Nothing to do here; everything is already set.
3131 } else if (fRXPat->fMinMatchLen == 0) {
3132 // Zero length match possible. We could start anywhere.
3133 fRXPat->fStartType = START_NO_INFO;
3134 } else if (fRXPat->fInitialChars->size() == 1) {
3135 // All matches begin with the same char.
3136 fRXPat->fStartType = START_CHAR;
3137 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3138 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3139 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3140 fRXPat->fMinMatchLen > 0) {
3141 // Matches start with a set of character smaller than the set of all chars.
3142 fRXPat->fStartType = START_SET;
3143 } else {
3144 // Matches can start with anything
3145 fRXPat->fStartType = START_NO_INFO;
3146 }
3147
3148 return;
3149 }
3150
3151
3152
3153 //------------------------------------------------------------------------------
3154 //
3155 // minMatchLength Calculate the length of the shortest string that could
3156 // match the specified pattern.
3157 // Length is in 16 bit code units, not code points.
3158 //
3159 // The calculated length may not be exact. The returned
3160 // value may be shorter than the actual minimum; it must
3161 // never be longer.
3162 //
3163 // start and end are the range of p-code operations to be
3164 // examined. The endpoints are included in the range.
3165 //
3166 //------------------------------------------------------------------------------
3167 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3168 if (U_FAILURE(*fStatus)) {
3169 return 0;
3170 }
3171
3172 U_ASSERT(start <= end);
3173 U_ASSERT(end < fRXPat->fCompiledPat->size());
3174
3175
3176 int32_t loc;
3177 int32_t op;
3178 int32_t opType;
3179 int32_t currentLen = 0;
3180
3181
3182 // forwardedLength is a vector holding minimum-match-length values that
3183 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3184 // It must be one longer than the pattern being checked because some ops
3185 // will jmp to a end-of-block+1 location from within a block, and we must
3186 // count those when checking the block.
3187 UVector32 forwardedLength(end+2, *fStatus);
3188 forwardedLength.setSize(end+2);
3189 for (loc=start; loc<=end+1; loc++) {
3190 forwardedLength.setElementAt(INT32_MAX, loc);
3191 }
3192
3193 for (loc = start; loc<=end; loc++) {
3194 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3195 opType = URX_TYPE(op);
3196
3197 // The loop is advancing linearly through the pattern.
3198 // If the op we are now at was the destination of a branch in the pattern,
3199 // and that path has a shorter minimum length than the current accumulated value,
3200 // replace the current accumulated value.
3201 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3202 // no-match-possible cases.
3203 if (forwardedLength.elementAti(loc) < currentLen) {
3204 currentLen = forwardedLength.elementAti(loc);
3205 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3206 }
3207
3208 switch (opType) {
3209 // Ops that don't change the total length matched
3210 case URX_RESERVED_OP:
3211 case URX_END:
3212 case URX_STRING_LEN:
3213 case URX_NOP:
3214 case URX_START_CAPTURE:
3215 case URX_END_CAPTURE:
3216 case URX_BACKSLASH_B:
3217 case URX_BACKSLASH_BU:
3218 case URX_BACKSLASH_G:
3219 case URX_BACKSLASH_Z:
3220 case URX_CARET:
3221 case URX_DOLLAR:
3222 case URX_DOLLAR_M:
3223 case URX_DOLLAR_D:
3224 case URX_DOLLAR_MD:
3225 case URX_RELOC_OPRND:
3226 case URX_STO_INP_LOC:
3227 case URX_CARET_M:
3228 case URX_CARET_M_UNIX:
3229 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3230 case URX_BACKREF_I:
3231
3232 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3233 case URX_LD_SP:
3234
3235 case URX_JMP_SAV:
3236 case URX_JMP_SAV_X:
3237 break;
3238
3239
3240 // Ops that match a minimum of one character (one or two 16 bit code units.)
3241 //
3242 case URX_ONECHAR:
3243 case URX_STATIC_SETREF:
3244 case URX_STAT_SETREF_N:
3245 case URX_SETREF:
3246 case URX_BACKSLASH_D:
3247 case URX_BACKSLASH_H:
3248 case URX_BACKSLASH_R:
3249 case URX_BACKSLASH_V:
3250 case URX_ONECHAR_I:
3251 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3252 case URX_DOTANY_ALL: // . matches one or two.
3253 case URX_DOTANY:
3254 case URX_DOTANY_UNIX:
3255 currentLen++;
3256 break;
3257
3258
3259 case URX_JMPX:
3260 loc++; // URX_JMPX has an extra operand, ignored here,
3261 // otherwise processed identically to URX_JMP.
3262 U_FALLTHROUGH;
3263 case URX_JMP:
3264 {
3265 int32_t jmpDest = URX_VAL(op);
3266 if (jmpDest < loc) {
3267 // Loop of some kind. Can safely ignore, the worst that will happen
3268 // is that we understate the true minimum length
3269 currentLen = forwardedLength.elementAti(loc+1);
3270 } else {
3271 // Forward jump. Propagate the current min length to the target loc of the jump.
3272 U_ASSERT(jmpDest <= end+1);
3273 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3274 forwardedLength.setElementAt(currentLen, jmpDest);
3275 }
3276 }
3277 }
3278 break;
3279
3280 case URX_BACKTRACK:
3281 {
3282 // Back-tracks are kind of like a branch, except that the min length was
3283 // propagated already, by the state save.
3284 currentLen = forwardedLength.elementAti(loc+1);
3285 }
3286 break;
3287
3288
3289 case URX_STATE_SAVE:
3290 {
3291 // State Save, for forward jumps, propagate the current minimum.
3292 // of the state save.
3293 int32_t jmpDest = URX_VAL(op);
3294 if (jmpDest > loc) {
3295 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3296 forwardedLength.setElementAt(currentLen, jmpDest);
3297 }
3298 }
3299 }
3300 break;
3301
3302
3303 case URX_STRING:
3304 {
3305 loc++;
3306 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3307 currentLen += URX_VAL(stringLenOp);
3308 }
3309 break;
3310
3311
3312 case URX_STRING_I:
3313 {
3314 loc++;
3315 // TODO: with full case folding, matching input text may be shorter than
3316 // the string we have here. More smarts could put some bounds on it.
3317 // Assume a min length of one for now. A min length of zero causes
3318 // optimization failures for a pattern like "string"+
3319 // currentLen += URX_VAL(stringLenOp);
3320 currentLen += 1;
3321 }
3322 break;
3323
3324 case URX_CTR_INIT:
3325 case URX_CTR_INIT_NG:
3326 {
3327 // Loop Init Ops.
3328 // If the min loop count == 0
3329 // move loc forwards to the end of the loop, skipping over the body.
3330 // If the min count is > 0,
3331 // continue normal processing of the body of the loop.
3332 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3333 loopEndLoc = URX_VAL(loopEndLoc);
3334 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3335 if (minLoopCount == 0) {
3336 loc = loopEndLoc;
3337 } else {
3338 loc+=3; // Skips over operands of CTR_INIT
3339 }
3340 }
3341 break;
3342
3343
3344 case URX_CTR_LOOP:
3345 case URX_CTR_LOOP_NG:
3346 // Loop ops.
3347 // The jump is conditional, backwards only.
3348 break;
3349
3350 case URX_LOOP_SR_I:
3351 case URX_LOOP_DOT_I:
3352 case URX_LOOP_C:
3353 // More loop ops. These state-save to themselves.
3354 // don't change the minimum match - could match nothing at all.
3355 break;
3356
3357
3358 case URX_LA_START:
3359 case URX_LB_START:
3360 {
3361 // Look-around. Scan forward until the matching look-ahead end,
3362 // without processing the look-around block. This is overly pessimistic for look-ahead,
3363 // it assumes that the look-ahead match might be zero-length.
3364 // TODO: Positive lookahead could recursively do the block, then continue
3365 // with the longer of the block or the value coming in. Ticket 6060
3366 int32_t depth = (opType == URX_LA_START? 2: 1);;
3367 for (;;) {
3368 loc++;
3369 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3370 if (URX_TYPE(op) == URX_LA_START) {
3371 // The boilerplate for look-ahead includes two LA_END insturctions,
3372 // Depth will be decremented by each one when it is seen.
3373 depth += 2;
3374 }
3375 if (URX_TYPE(op) == URX_LB_START) {
3376 depth++;
3377 }
3378 if (URX_TYPE(op) == URX_LA_END) {
3379 depth--;
3380 if (depth == 0) {
3381 break;
3382 }
3383 }
3384 if (URX_TYPE(op)==URX_LBN_END) {
3385 depth--;
3386 if (depth == 0) {
3387 break;
3388 }
3389 }
3390 if (URX_TYPE(op) == URX_STATE_SAVE) {
3391 // Need this because neg lookahead blocks will FAIL to outside
3392 // of the block.
3393 int32_t jmpDest = URX_VAL(op);
3394 if (jmpDest > loc) {
3395 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3396 forwardedLength.setElementAt(currentLen, jmpDest);
3397 }
3398 }
3399 }
3400 U_ASSERT(loc <= end);
3401 }
3402 }
3403 break;
3404
3405 case URX_LA_END:
3406 case URX_LB_CONT:
3407 case URX_LB_END:
3408 case URX_LBN_CONT:
3409 case URX_LBN_END:
3410 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3411 // range being sized, which happens when measuring size of look-behind blocks.
3412 break;
3413
3414 default:
3415 U_ASSERT(FALSE);
3416 }
3417
3418 }
3419
3420 // We have finished walking through the ops. Check whether some forward jump
3421 // propagated a shorter length to location end+1.
3422 if (forwardedLength.elementAti(end+1) < currentLen) {
3423 currentLen = forwardedLength.elementAti(end+1);
3424 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3425 }
3426
3427 return currentLen;
3428 }
3429
3430 // Increment with overflow check.
3431 // val and delta will both be positive.
3432
3433 static int32_t safeIncrement(int32_t val, int32_t delta) {
3434 if (INT32_MAX - val > delta) {
3435 return val + delta;
3436 } else {
3437 return INT32_MAX;
3438 }
3439 }
3440
3441
3442 //------------------------------------------------------------------------------
3443 //
3444 // maxMatchLength Calculate the length of the longest string that could
3445 // match the specified pattern.
3446 // Length is in 16 bit code units, not code points.
3447 //
3448 // The calculated length may not be exact. The returned
3449 // value may be longer than the actual maximum; it must
3450 // never be shorter.
3451 //
3452 //------------------------------------------------------------------------------
3453 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3454 if (U_FAILURE(*fStatus)) {
3455 return 0;
3456 }
3457 U_ASSERT(start <= end);
3458 U_ASSERT(end < fRXPat->fCompiledPat->size());
3459
3460
3461 int32_t loc;
3462 int32_t op;
3463 int32_t opType;
3464 int32_t currentLen = 0;
3465 UVector32 forwardedLength(end+1, *fStatus);
3466 forwardedLength.setSize(end+1);
3467
3468 for (loc=start; loc<=end; loc++) {
3469 forwardedLength.setElementAt(0, loc);
3470 }
3471
3472 for (loc = start; loc<=end; loc++) {
3473 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3474 opType = URX_TYPE(op);
3475
3476 // The loop is advancing linearly through the pattern.
3477 // If the op we are now at was the destination of a branch in the pattern,
3478 // and that path has a longer maximum length than the current accumulated value,
3479 // replace the current accumulated value.
3480 if (forwardedLength.elementAti(loc) > currentLen) {
3481 currentLen = forwardedLength.elementAti(loc);
3482 }
3483
3484 switch (opType) {
3485 // Ops that don't change the total length matched
3486 case URX_RESERVED_OP:
3487 case URX_END:
3488 case URX_STRING_LEN:
3489 case URX_NOP:
3490 case URX_START_CAPTURE:
3491 case URX_END_CAPTURE:
3492 case URX_BACKSLASH_B:
3493 case URX_BACKSLASH_BU:
3494 case URX_BACKSLASH_G:
3495 case URX_BACKSLASH_Z:
3496 case URX_CARET:
3497 case URX_DOLLAR:
3498 case URX_DOLLAR_M:
3499 case URX_DOLLAR_D:
3500 case URX_DOLLAR_MD:
3501 case URX_RELOC_OPRND:
3502 case URX_STO_INP_LOC:
3503 case URX_CARET_M:
3504 case URX_CARET_M_UNIX:
3505
3506 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3507 case URX_LD_SP:
3508
3509 case URX_LB_END:
3510 case URX_LB_CONT:
3511 case URX_LBN_CONT:
3512 case URX_LBN_END:
3513 break;
3514
3515
3516 // Ops that increase that cause an unbounded increase in the length
3517 // of a matched string, or that increase it a hard to characterize way.
3518 // Call the max length unbounded, and stop further checking.
3519 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3520 case URX_BACKREF_I:
3521 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3522 currentLen = INT32_MAX;
3523 break;
3524
3525
3526 // Ops that match a max of one character (possibly two 16 bit code units.)
3527 //
3528 case URX_STATIC_SETREF:
3529 case URX_STAT_SETREF_N:
3530 case URX_SETREF:
3531 case URX_BACKSLASH_D:
3532 case URX_BACKSLASH_H:
3533 case URX_BACKSLASH_R:
3534 case URX_BACKSLASH_V:
3535 case URX_ONECHAR_I:
3536 case URX_DOTANY_ALL:
3537 case URX_DOTANY:
3538 case URX_DOTANY_UNIX:
3539 currentLen = safeIncrement(currentLen, 2);
3540 break;
3541
3542 // Single literal character. Increase current max length by one or two,
3543 // depending on whether the char is in the supplementary range.
3544 case URX_ONECHAR:
3545 currentLen = safeIncrement(currentLen, 1);
3546 if (URX_VAL(op) > 0x10000) {
3547 currentLen = safeIncrement(currentLen, 1);
3548 }
3549 break;
3550
3551 // Jumps.
3552 //
3553 case URX_JMP:
3554 case URX_JMPX:
3555 case URX_JMP_SAV:
3556 case URX_JMP_SAV_X:
3557 {
3558 int32_t jmpDest = URX_VAL(op);
3559 if (jmpDest < loc) {
3560 // Loop of some kind. Max match length is unbounded.
3561 currentLen = INT32_MAX;
3562 } else {
3563 // Forward jump. Propagate the current min length to the target loc of the jump.
3564 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3565 forwardedLength.setElementAt(currentLen, jmpDest);
3566 }
3567 currentLen = 0;
3568 }
3569 }
3570 break;
3571
3572 case URX_BACKTRACK:
3573 // back-tracks are kind of like a branch, except that the max length was
3574 // propagated already, by the state save.
3575 currentLen = forwardedLength.elementAti(loc+1);
3576 break;
3577
3578
3579 case URX_STATE_SAVE:
3580 {
3581 // State Save, for forward jumps, propagate the current minimum.
3582 // of the state save.
3583 // For backwards jumps, they create a loop, maximum
3584 // match length is unbounded.
3585 int32_t jmpDest = URX_VAL(op);
3586 if (jmpDest > loc) {
3587 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3588 forwardedLength.setElementAt(currentLen, jmpDest);
3589 }
3590 } else {
3591 currentLen = INT32_MAX;
3592 }
3593 }
3594 break;
3595
3596
3597
3598
3599 case URX_STRING:
3600 {
3601 loc++;
3602 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3603 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3604 break;
3605 }
3606
3607 case URX_STRING_I:
3608 // TODO: This code assumes that any user string that matches will be no longer
3609 // than our compiled string, with case insensitive matching.
3610 // Our compiled string has been case-folded already.
3611 //
3612 // Any matching user string will have no more code points than our
3613 // compiled (folded) string. Folding may add code points, but
3614 // not remove them.
3615 //
3616 // There is a potential problem if a supplemental code point
3617 // case-folds to a BMP code point. In this case our compiled string
3618 // could be shorter (in code units) than a matching user string.
3619 //
3620 // At this time (Unicode 6.1) there are no such characters, and this case
3621 // is not being handled. A test, intltest regex/Bug9283, will fail if
3622 // any problematic characters are added to Unicode.
3623 //
3624 // If this happens, we can make a set of the BMP chars that the
3625 // troublesome supplementals fold to, scan our string, and bump the
3626 // currentLen one extra for each that is found.
3627 //
3628 {
3629 loc++;
3630 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3631 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3632 }
3633 break;
3634
3635 case URX_CTR_INIT:
3636 case URX_CTR_INIT_NG:
3637 // For Loops, recursively call this function on the pattern for the loop body,
3638 // then multiply the result by the maximum loop count.
3639 {
3640 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3641 if (loopEndLoc == loc+4) {
3642 // Loop has an empty body. No affect on max match length.
3643 // Continue processing with code after the loop end.
3644 loc = loopEndLoc;
3645 break;
3646 }
3647
3648 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3649 if (maxLoopCount == -1) {
3650 // Unbounded Loop. No upper bound on match length.
3651 currentLen = INT32_MAX;
3652 break;
3653 }
3654
3655 U_ASSERT(loopEndLoc >= loc+4);
3656 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3657 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3658 if (updatedLen >= INT32_MAX) {
3659 currentLen = INT32_MAX;
3660 break;
3661 }
3662 currentLen = (int32_t)updatedLen;
3663 loc = loopEndLoc;
3664 break;
3665 }
3666
3667 case URX_CTR_LOOP:
3668 case URX_CTR_LOOP_NG:
3669 // These opcodes will be skipped over by code for URX_CRT_INIT.
3670 // We shouldn't encounter them here.
3671 U_ASSERT(FALSE);
3672 break;
3673
3674 case URX_LOOP_SR_I:
3675 case URX_LOOP_DOT_I:
3676 case URX_LOOP_C:
3677 // For anything to do with loops, make the match length unbounded.
3678 currentLen = INT32_MAX;
3679 break;
3680
3681
3682
3683 case URX_LA_START:
3684 case URX_LA_END:
3685 // Look-ahead. Just ignore, treat the look-ahead block as if
3686 // it were normal pattern. Gives a too-long match length,
3687 // but good enough for now.
3688 break;
3689
3690 // End of look-ahead ops should always be consumed by the processing at
3691 // the URX_LA_START op.
3692 // U_ASSERT(FALSE);
3693 // break;
3694
3695 case URX_LB_START:
3696 {
3697 // Look-behind. Scan forward until the matching look-around end,
3698 // without processing the look-behind block.
3699 int32_t depth = 0;
3700 for (;;) {
3701 loc++;
3702 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3703 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3704 depth++;
3705 }
3706 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3707 if (depth == 0) {
3708 break;
3709 }
3710 depth--;
3711 }
3712 U_ASSERT(loc < end);
3713 }
3714 }
3715 break;
3716
3717 default:
3718 U_ASSERT(FALSE);
3719 }
3720
3721
3722 if (currentLen == INT32_MAX) {
3723 // The maximum length is unbounded.
3724 // Stop further processing of the pattern.
3725 break;
3726 }
3727
3728 }
3729 return currentLen;
3730
3731 }
3732
3733
3734 //------------------------------------------------------------------------------
3735 //
3736 // stripNOPs Remove any NOP operations from the compiled pattern code.
3737 // Extra NOPs are inserted for some constructs during the initial
3738 // code generation to provide locations that may be patched later.
3739 // Many end up unneeded, and are removed by this function.
3740 //
3741 // In order to minimize the number of passes through the pattern,
3742 // back-reference fixup is also performed here (adjusting
3743 // back-reference operands to point to the correct frame offsets).
3744 //
3745 //------------------------------------------------------------------------------
3746 void RegexCompile::stripNOPs() {
3747
3748 if (U_FAILURE(*fStatus)) {
3749 return;
3750 }
3751
3752 int32_t end = fRXPat->fCompiledPat->size();
3753 UVector32 deltas(end, *fStatus);
3754
3755 // Make a first pass over the code, computing the amount that things
3756 // will be offset at each location in the original code.
3757 int32_t loc;
3758 int32_t d = 0;
3759 for (loc=0; loc<end; loc++) {
3760 deltas.addElement(d, *fStatus);
3761 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3762 if (URX_TYPE(op) == URX_NOP) {
3763 d++;
3764 }
3765 }
3766
3767 UnicodeString caseStringBuffer;
3768
3769 // Make a second pass over the code, removing the NOPs by moving following
3770 // code up, and patching operands that refer to code locations that
3771 // are being moved. The array of offsets from the first step is used
3772 // to compute the new operand values.
3773 int32_t src;
3774 int32_t dst = 0;
3775 for (src=0; src<end; src++) {
3776 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3777 int32_t opType = URX_TYPE(op);
3778 switch (opType) {
3779 case URX_NOP:
3780 break;
3781
3782 case URX_STATE_SAVE:
3783 case URX_JMP:
3784 case URX_CTR_LOOP:
3785 case URX_CTR_LOOP_NG:
3786 case URX_RELOC_OPRND:
3787 case URX_JMPX:
3788 case URX_JMP_SAV:
3789 case URX_JMP_SAV_X:
3790 // These are instructions with operands that refer to code locations.
3791 {
3792 int32_t operandAddress = URX_VAL(op);
3793 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3794 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3795 op = buildOp(opType, fixedOperandAddress);
3796 fRXPat->fCompiledPat->setElementAt(op, dst);
3797 dst++;
3798 break;
3799 }
3800
3801 case URX_BACKREF:
3802 case URX_BACKREF_I:
3803 {
3804 int32_t where = URX_VAL(op);
3805 if (where > fRXPat->fGroupMap->size()) {
3806 error(U_REGEX_INVALID_BACK_REF);
3807 break;
3808 }
3809 where = fRXPat->fGroupMap->elementAti(where-1);
3810 op = buildOp(opType, where);
3811 fRXPat->fCompiledPat->setElementAt(op, dst);
3812 dst++;
3813
3814 fRXPat->fNeedsAltInput = TRUE;
3815 break;
3816 }
3817 case URX_RESERVED_OP:
3818 case URX_RESERVED_OP_N:
3819 case URX_BACKTRACK:
3820 case URX_END:
3821 case URX_ONECHAR:
3822 case URX_STRING:
3823 case URX_STRING_LEN:
3824 case URX_START_CAPTURE:
3825 case URX_END_CAPTURE:
3826 case URX_STATIC_SETREF:
3827 case URX_STAT_SETREF_N:
3828 case URX_SETREF:
3829 case URX_DOTANY:
3830 case URX_FAIL:
3831 case URX_BACKSLASH_B:
3832 case URX_BACKSLASH_BU:
3833 case URX_BACKSLASH_G:
3834 case URX_BACKSLASH_X:
3835 case URX_BACKSLASH_Z:
3836 case URX_DOTANY_ALL:
3837 case URX_BACKSLASH_D:
3838 case URX_CARET:
3839 case URX_DOLLAR:
3840 case URX_CTR_INIT:
3841 case URX_CTR_INIT_NG:
3842 case URX_DOTANY_UNIX:
3843 case URX_STO_SP:
3844 case URX_LD_SP:
3845 case URX_STO_INP_LOC:
3846 case URX_LA_START:
3847 case URX_LA_END:
3848 case URX_ONECHAR_I:
3849 case URX_STRING_I:
3850 case URX_DOLLAR_M:
3851 case URX_CARET_M:
3852 case URX_CARET_M_UNIX:
3853 case URX_LB_START:
3854 case URX_LB_CONT:
3855 case URX_LB_END:
3856 case URX_LBN_CONT:
3857 case URX_LBN_END:
3858 case URX_LOOP_SR_I:
3859 case URX_LOOP_DOT_I:
3860 case URX_LOOP_C:
3861 case URX_DOLLAR_D:
3862 case URX_DOLLAR_MD:
3863 case URX_BACKSLASH_H:
3864 case URX_BACKSLASH_R:
3865 case URX_BACKSLASH_V:
3866 // These instructions are unaltered by the relocation.
3867 fRXPat->fCompiledPat->setElementAt(op, dst);
3868 dst++;
3869 break;
3870
3871 default:
3872 // Some op is unaccounted for.
3873 U_ASSERT(FALSE);
3874 error(U_REGEX_INTERNAL_ERROR);
3875 }
3876 }
3877
3878 fRXPat->fCompiledPat->setSize(dst);
3879 }
3880
3881
3882
3883
3884 //------------------------------------------------------------------------------
3885 //
3886 // Error Report a rule parse error.
3887 // Only report it if no previous error has been recorded.
3888 //
3889 //------------------------------------------------------------------------------
3890 void RegexCompile::error(UErrorCode e) {
3891 if (U_SUCCESS(*fStatus)) {
3892 *fStatus = e;
3893 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3894 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3895 // int64_t. If the values of the latter are out of range for the former,
3896 // set them to the appropriate "field not supported" values.
3897 if (fLineNum > 0x7FFFFFFF) {
3898 fParseErr->line = 0;
3899 fParseErr->offset = -1;
3900 } else if (fCharNum > 0x7FFFFFFF) {
3901 fParseErr->line = (int32_t)fLineNum;
3902 fParseErr->offset = -1;
3903 } else {
3904 fParseErr->line = (int32_t)fLineNum;
3905 fParseErr->offset = (int32_t)fCharNum;
3906 }
3907
3908 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3909
3910 // Fill in the context.
3911 // Note: extractBetween() pins supplied indicies to the string bounds.
3912 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3913 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3914 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3915 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3916 }
3917 }
3918
3919
3920 //
3921 // Assorted Unicode character constants.
3922 // Numeric because there is no portable way to enter them as literals.
3923 // (Think EBCDIC).
3924 //
3925 static const UChar chCR = 0x0d; // New lines, for terminating comments.
3926 static const UChar chLF = 0x0a; // Line Feed
3927 static const UChar chPound = 0x23; // '#', introduces a comment.
3928 static const UChar chDigit0 = 0x30; // '0'
3929 static const UChar chDigit7 = 0x37; // '9'
3930 static const UChar chColon = 0x3A; // ':'
3931 static const UChar chE = 0x45; // 'E'
3932 static const UChar chQ = 0x51; // 'Q'
3933 //static const UChar chN = 0x4E; // 'N'
3934 static const UChar chP = 0x50; // 'P'
3935 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3936 //static const UChar chLBracket = 0x5b; // '['
3937 static const UChar chRBracket = 0x5d; // ']'
3938 static const UChar chUp = 0x5e; // '^'
3939 static const UChar chLowerP = 0x70;
3940 static const UChar chLBrace = 0x7b; // '{'
3941 static const UChar chRBrace = 0x7d; // '}'
3942 static const UChar chNEL = 0x85; // NEL newline variant
3943 static const UChar chLS = 0x2028; // Unicode Line Separator
3944
3945
3946 //------------------------------------------------------------------------------
3947 //
3948 // nextCharLL Low Level Next Char from the regex pattern.
3949 // Get a char from the string, keep track of input position
3950 // for error reporting.
3951 //
3952 //------------------------------------------------------------------------------
3953 UChar32 RegexCompile::nextCharLL() {
3954 UChar32 ch;
3955
3956 if (fPeekChar != -1) {
3957 ch = fPeekChar;
3958 fPeekChar = -1;
3959 return ch;
3960 }
3961
3962 // assume we're already in the right place
3963 ch = UTEXT_NEXT32(fRXPat->fPattern);
3964 if (ch == U_SENTINEL) {
3965 return ch;
3966 }
3967
3968 if (ch == chCR ||
3969 ch == chNEL ||
3970 ch == chLS ||
3971 (ch == chLF && fLastChar != chCR)) {
3972 // Character is starting a new line. Bump up the line number, and
3973 // reset the column to 0.
3974 fLineNum++;
3975 fCharNum=0;
3976 }
3977 else {
3978 // Character is not starting a new line. Except in the case of a
3979 // LF following a CR, increment the column position.
3980 if (ch != chLF) {
3981 fCharNum++;
3982 }
3983 }
3984 fLastChar = ch;
3985 return ch;
3986 }
3987
3988 //------------------------------------------------------------------------------
3989 //
3990 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3991 // character without actually getting it.
3992 //
3993 //------------------------------------------------------------------------------
3994 UChar32 RegexCompile::peekCharLL() {
3995 if (fPeekChar == -1) {
3996 fPeekChar = nextCharLL();
3997 }
3998 return fPeekChar;
3999 }
4000
4001
4002 //------------------------------------------------------------------------------
4003 //
4004 // nextChar for pattern scanning. At this level, we handle stripping
4005 // out comments and processing some backslash character escapes.
4006 // The rest of the pattern grammar is handled at the next level up.
4007 //
4008 //------------------------------------------------------------------------------
4009 void RegexCompile::nextChar(RegexPatternChar &c) {
4010
4011 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4012 c.fChar = nextCharLL();
4013 c.fQuoted = FALSE;
4014
4015 if (fQuoteMode) {
4016 c.fQuoted = TRUE;
4017 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4018 c.fChar == (UChar32)-1) {
4019 fQuoteMode = FALSE; // Exit quote mode,
4020 nextCharLL(); // discard the E
4021 nextChar(c); // recurse to get the real next char
4022 }
4023 }
4024 else if (fInBackslashQuote) {
4025 // The current character immediately follows a '\'
4026 // Don't check for any further escapes, just return it as-is.
4027 // Don't set c.fQuoted, because that would prevent the state machine from
4028 // dispatching on the character.
4029 fInBackslashQuote = FALSE;
4030 }
4031 else
4032 {
4033 // We are not in a \Q quoted region \E of the source.
4034 //
4035 if (fModeFlags & UREGEX_COMMENTS) {
4036 //
4037 // We are in free-spacing and comments mode.
4038 // Scan through any white space and comments, until we
4039 // reach a significant character or the end of inut.
4040 for (;;) {
4041 if (c.fChar == (UChar32)-1) {
4042 break; // End of Input
4043 }
4044 if (c.fChar == chPound && fEOLComments == TRUE) {
4045 // Start of a comment. Consume the rest of it, until EOF or a new line
4046 for (;;) {
4047 c.fChar = nextCharLL();
4048 if (c.fChar == (UChar32)-1 || // EOF
4049 c.fChar == chCR ||
4050 c.fChar == chLF ||
4051 c.fChar == chNEL ||
4052 c.fChar == chLS) {
4053 break;
4054 }
4055 }
4056 }
4057 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4058 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4059 break;
4060 }
4061 c.fChar = nextCharLL();
4062 }
4063 }
4064
4065 //
4066 // check for backslash escaped characters.
4067 //
4068 if (c.fChar == chBackSlash) {
4069 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4070 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4071 //
4072 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4073 // Includes \uxxxx, \n, \r, many others.
4074 // Return the single equivalent character.
4075 //
4076 nextCharLL(); // get & discard the peeked char.
4077 c.fQuoted = TRUE;
4078
4079 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4080 int32_t endIndex = (int32_t)pos;
4081 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4082
4083 if (endIndex == pos) {
4084 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4085 }
4086 fCharNum += endIndex - pos;
4087 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4088 } else {
4089 int32_t offset = 0;
4090 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4091
4092 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4093 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4094
4095 if (offset == 0) {
4096 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4097 } else if (context.lastOffset == offset) {
4098 UTEXT_PREVIOUS32(fRXPat->fPattern);
4099 } else if (context.lastOffset != offset-1) {
4100 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4101 }
4102 fCharNum += offset;
4103 }
4104 }
4105 else if (peekCharLL() == chDigit0) {
4106 // Octal Escape, using Java Regexp Conventions
4107 // which are \0 followed by 1-3 octal digits.
4108 // Different from ICU Unescape handling of Octal, which does not
4109 // require the leading 0.
4110 // Java also has the convention of only consuming 2 octal digits if
4111 // the three digit number would be > 0xff
4112 //
4113 c.fChar = 0;
4114 nextCharLL(); // Consume the initial 0.
4115 int index;
4116 for (index=0; index<3; index++) {
4117 int32_t ch = peekCharLL();
4118 if (ch<chDigit0 || ch>chDigit7) {
4119 if (index==0) {
4120 // \0 is not followed by any octal digits.
4121 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4122 }
4123 break;
4124 }
4125 c.fChar <<= 3;
4126 c.fChar += ch&7;
4127 if (c.fChar <= 255) {
4128 nextCharLL();
4129 } else {
4130 // The last digit made the number too big. Forget we saw it.
4131 c.fChar >>= 3;
4132 }
4133 }
4134 c.fQuoted = TRUE;
4135 }
4136 else if (peekCharLL() == chQ) {
4137 // "\Q" enter quote mode, which will continue until "\E"
4138 fQuoteMode = TRUE;
4139 nextCharLL(); // discard the 'Q'.
4140 nextChar(c); // recurse to get the real next char.
4141 }
4142 else
4143 {
4144 // We are in a '\' escape that will be handled by the state table scanner.
4145 // Just return the backslash, but remember that the following char is to
4146 // be taken literally.
4147 fInBackslashQuote = TRUE;
4148 }
4149 }
4150 }
4151
4152 // re-enable # to end-of-line comments, in case they were disabled.
4153 // They are disabled by the parser upon seeing '(?', but this lasts for
4154 // the fetching of the next character only.
4155 fEOLComments = TRUE;
4156
4157 // putc(c.fChar, stdout);
4158 }
4159
4160
4161
4162 //------------------------------------------------------------------------------
4163 //
4164 // scanNamedChar
4165 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4166 //
4167 // The scan position will be at the 'N'. On return
4168 // the scan position should be just after the '}'
4169 //
4170 // Return the UChar32
4171 //
4172 //------------------------------------------------------------------------------
4173 UChar32 RegexCompile::scanNamedChar() {
4174 if (U_FAILURE(*fStatus)) {
4175 return 0;
4176 }
4177
4178 nextChar(fC);
4179 if (fC.fChar != chLBrace) {
4180 error(U_REGEX_PROPERTY_SYNTAX);
4181 return 0;
4182 }
4183
4184 UnicodeString charName;
4185 for (;;) {
4186 nextChar(fC);
4187 if (fC.fChar == chRBrace) {
4188 break;
4189 }
4190 if (fC.fChar == -1) {
4191 error(U_REGEX_PROPERTY_SYNTAX);
4192 return 0;
4193 }
4194 charName.append(fC.fChar);
4195 }
4196
4197 char name[100];
4198 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4199 (uint32_t)charName.length()>=sizeof(name)) {
4200 // All Unicode character names have only invariant characters.
4201 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4202 // which requires this error check
4203 error(U_REGEX_PROPERTY_SYNTAX);
4204 return 0;
4205 }
4206 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4207
4208 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4209 if (U_FAILURE(*fStatus)) {
4210 error(U_REGEX_PROPERTY_SYNTAX);
4211 }
4212
4213 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4214 return theChar;
4215 }
4216
4217 //------------------------------------------------------------------------------
4218 //
4219 // scanProp Construct a UnicodeSet from the text at the current scan
4220 // position, which will be of the form \p{whaterver}
4221 //
4222 // The scan position will be at the 'p' or 'P'. On return
4223 // the scan position should be just after the '}'
4224 //
4225 // Return a UnicodeSet, constructed from the \P pattern,
4226 // or NULL if the pattern is invalid.
4227 //
4228 //------------------------------------------------------------------------------
4229 UnicodeSet *RegexCompile::scanProp() {
4230 UnicodeSet *uset = NULL;
4231
4232 if (U_FAILURE(*fStatus)) {
4233 return NULL;
4234 }
4235 (void)chLowerP; // Suppress compiler unused variable warning.
4236 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4237 UBool negated = (fC.fChar == chP);
4238
4239 UnicodeString propertyName;
4240 nextChar(fC);
4241 if (fC.fChar != chLBrace) {
4242 error(U_REGEX_PROPERTY_SYNTAX);
4243 return NULL;
4244 }
4245 for (;;) {
4246 nextChar(fC);
4247 if (fC.fChar == chRBrace) {
4248 break;
4249 }
4250 if (fC.fChar == -1) {
4251 // Hit the end of the input string without finding the closing '}'
4252 error(U_REGEX_PROPERTY_SYNTAX);
4253 return NULL;
4254 }
4255 propertyName.append(fC.fChar);
4256 }
4257 uset = createSetForProperty(propertyName, negated);
4258 nextChar(fC); // Move input scan to position following the closing '}'
4259 return uset;
4260 }
4261
4262 //------------------------------------------------------------------------------
4263 //
4264 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4265 // position, which is expected be of the form [:property expression:]
4266 //
4267 // The scan position will be at the opening ':'. On return
4268 // the scan position must be on the closing ']'
4269 //
4270 // Return a UnicodeSet constructed from the pattern,
4271 // or NULL if this is not a valid POSIX-style set expression.
4272 // If not a property expression, restore the initial scan position
4273 // (to the opening ':')
4274 //
4275 // Note: the opening '[:' is not sufficient to guarantee that
4276 // this is a [:property:] expression.
4277 // [:'+=,] is a perfectly good ordinary set expression that
4278 // happens to include ':' as one of its characters.
4279 //
4280 //------------------------------------------------------------------------------
4281 UnicodeSet *RegexCompile::scanPosixProp() {
4282 UnicodeSet *uset = NULL;
4283
4284 if (U_FAILURE(*fStatus)) {
4285 return NULL;
4286 }
4287
4288 U_ASSERT(fC.fChar == chColon);
4289
4290 // Save the scanner state.
4291 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4292 int64_t savedScanIndex = fScanIndex;
4293 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4294 UBool savedQuoteMode = fQuoteMode;
4295 UBool savedInBackslashQuote = fInBackslashQuote;
4296 UBool savedEOLComments = fEOLComments;
4297 int64_t savedLineNum = fLineNum;
4298 int64_t savedCharNum = fCharNum;
4299 UChar32 savedLastChar = fLastChar;
4300 UChar32 savedPeekChar = fPeekChar;
4301 RegexPatternChar savedfC = fC;
4302
4303 // Scan for a closing ]. A little tricky because there are some perverse
4304 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4305 // ending on the second closing ].
4306
4307 UnicodeString propName;
4308 UBool negated = FALSE;
4309
4310 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4311 nextChar(fC);
4312 if (fC.fChar == chUp) {
4313 negated = TRUE;
4314 nextChar(fC);
4315 }
4316
4317 // Scan for the closing ":]", collecting the property name along the way.
4318 UBool sawPropSetTerminator = FALSE;
4319 for (;;) {
4320 propName.append(fC.fChar);
4321 nextChar(fC);
4322 if (fC.fQuoted || fC.fChar == -1) {
4323 // Escaped characters or end of input - either says this isn't a [:Property:]
4324 break;
4325 }
4326 if (fC.fChar == chColon) {
4327 nextChar(fC);
4328 if (fC.fChar == chRBracket) {
4329 sawPropSetTerminator = TRUE;
4330 }
4331 break;
4332 }
4333 }
4334
4335 if (sawPropSetTerminator) {
4336 uset = createSetForProperty(propName, negated);
4337 }
4338 else
4339 {
4340 // No closing ":]".
4341 // Restore the original scan position.
4342 // The main scanner will retry the input as a normal set expression,
4343 // not a [:Property:] expression.
4344 fScanIndex = savedScanIndex;
4345 fQuoteMode = savedQuoteMode;
4346 fInBackslashQuote = savedInBackslashQuote;
4347 fEOLComments = savedEOLComments;
4348 fLineNum = savedLineNum;
4349 fCharNum = savedCharNum;
4350 fLastChar = savedLastChar;
4351 fPeekChar = savedPeekChar;
4352 fC = savedfC;
4353 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4354 }
4355 return uset;
4356 }
4357
4358 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4359 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4360 addCategory(set, U_GC_CF_MASK, ec);
4361 }
4362
4363 //
4364 // Create a Unicode Set from a Unicode Property expression.
4365 // This is common code underlying both \p{...} ane [:...:] expressions.
4366 // Includes trying the Java "properties" that aren't supported as
4367 // normal ICU UnicodeSet properties
4368 //
4369 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4370 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4371 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4372 UnicodeString setExpr;
4373 UnicodeSet *set;
4374 uint32_t usetFlags = 0;
4375
4376 if (U_FAILURE(*fStatus)) {
4377 return NULL;
4378 }
4379
4380 //
4381 // First try the property as we received it
4382 //
4383 if (negated) {
4384 setExpr.append(negSetPrefix, -1);
4385 } else {
4386 setExpr.append(posSetPrefix, -1);
4387 }
4388 setExpr.append(propName);
4389 setExpr.append(chRBrace);
4390 setExpr.append(chRBracket);
4391 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4392 usetFlags |= USET_CASE_INSENSITIVE;
4393 }
4394 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4395 if (U_SUCCESS(*fStatus)) {
4396 return set;
4397 }
4398 delete set;
4399 set = NULL;
4400
4401 //
4402 // The property as it was didn't work.
4403
4404 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
4405 // or standard Java, but many other regular expression packages do recognize it.
4406
4407 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4408 *fStatus = U_ZERO_ERROR;
4409 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4410 if (set == NULL) {
4411 *fStatus = U_MEMORY_ALLOCATION_ERROR;
4412 return set;
4413 }
4414 if (negated) {
4415 set->complement();
4416 }
4417 return set;
4418 }
4419
4420
4421 // Do Java fixes -
4422 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4423 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4424 //
4425 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4426 // is accepted by Java. The property part of the name is compared
4427 // case-insenstively. The spaces must be exactly as shown, either
4428 // all there, or all omitted, with exactly one at each position
4429 // if they are present. From checking against JDK 1.6
4430 //
4431 // This code should be removed when ICU properties support the Java compatibility names
4432 // (ICU 4.0?)
4433 //
4434 UnicodeString mPropName = propName;
4435 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4436 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4437 }
4438 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4439 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4440 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4441 }
4442 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4443 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4444 }
4445
4446 // See if the property looks like a Java "InBlockName", which
4447 // we will recast as "Block=BlockName"
4448 //
4449 static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
4450 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
4451 if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4452 setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
4453 setExpr.append(BLOCK, -1);
4454 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed.
4455 setExpr.append(chRBrace);
4456 setExpr.append(chRBracket);
4457 *fStatus = U_ZERO_ERROR;
4458 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4459 if (U_SUCCESS(*fStatus)) {
4460 return set;
4461 }
4462 delete set;
4463 set = NULL;
4464 }
4465
4466 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4467 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4468 {
4469 UErrorCode localStatus = U_ZERO_ERROR;
4470 //setExpr.remove();
4471 set = new UnicodeSet();
4472 //
4473 // Try the various Java specific properties.
4474 // These all begin with "java"
4475 //
4476 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4477 addCategory(set, U_GC_CN_MASK, localStatus);
4478 set->complement();
4479 }
4480 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4481 addCategory(set, U_GC_ND_MASK, localStatus);
4482 }
4483 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4484 addIdentifierIgnorable(set, localStatus);
4485 }
4486 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4487 set->add(0, 0x1F).add(0x7F, 0x9F);
4488 }
4489 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4490 addCategory(set, U_GC_L_MASK, localStatus);
4491 addCategory(set, U_GC_SC_MASK, localStatus);
4492 addCategory(set, U_GC_PC_MASK, localStatus);
4493 addCategory(set, U_GC_ND_MASK, localStatus);
4494 addCategory(set, U_GC_NL_MASK, localStatus);
4495 addCategory(set, U_GC_MC_MASK, localStatus);
4496 addCategory(set, U_GC_MN_MASK, localStatus);
4497 addIdentifierIgnorable(set, localStatus);
4498 }
4499 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4500 addCategory(set, U_GC_L_MASK, localStatus);
4501 addCategory(set, U_GC_NL_MASK, localStatus);
4502 addCategory(set, U_GC_SC_MASK, localStatus);
4503 addCategory(set, U_GC_PC_MASK, localStatus);
4504 }
4505 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4506 addCategory(set, U_GC_L_MASK, localStatus);
4507 }
4508 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4509 addCategory(set, U_GC_L_MASK, localStatus);
4510 addCategory(set, U_GC_ND_MASK, localStatus);
4511 }
4512 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4513 addCategory(set, U_GC_LL_MASK, localStatus);
4514 }
4515 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4516 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4517 }
4518 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4519 addCategory(set, U_GC_Z_MASK, localStatus);
4520 }
4521 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4522 set->add(0x10000, UnicodeSet::MAX_VALUE);
4523 }
4524 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4525 addCategory(set, U_GC_LT_MASK, localStatus);
4526 }
4527 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4528 addCategory(set, U_GC_L_MASK, localStatus);
4529 addCategory(set, U_GC_NL_MASK, localStatus);
4530 }
4531 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4532 addCategory(set, U_GC_L_MASK, localStatus);
4533 addCategory(set, U_GC_PC_MASK, localStatus);
4534 addCategory(set, U_GC_ND_MASK, localStatus);
4535 addCategory(set, U_GC_NL_MASK, localStatus);
4536 addCategory(set, U_GC_MC_MASK, localStatus);
4537 addCategory(set, U_GC_MN_MASK, localStatus);
4538 addIdentifierIgnorable(set, localStatus);
4539 }
4540 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4541 addCategory(set, U_GC_LU_MASK, localStatus);
4542 }
4543 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4544 set->add(0, UnicodeSet::MAX_VALUE);
4545 }
4546 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4547 addCategory(set, U_GC_Z_MASK, localStatus);
4548 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4549 set->add(9, 0x0d).add(0x1c, 0x1f);
4550 }
4551 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4552 set->add(0, UnicodeSet::MAX_VALUE);
4553 }
4554
4555 if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4556 *fStatus = U_ZERO_ERROR;
4557 if (usetFlags & USET_CASE_INSENSITIVE) {
4558 set->closeOver(USET_CASE_INSENSITIVE);
4559 }
4560 if (negated) {
4561 set->complement();
4562 }
4563 return set;
4564 }
4565 delete set;
4566 set = NULL;
4567 }
4568 error(*fStatus);
4569 return NULL;
4570 }
4571
4572
4573
4574 //
4575 // SetEval Part of the evaluation of [set expressions].
4576 // Perform any pending (stacked) operations with precedence
4577 // equal or greater to that of the next operator encountered
4578 // in the expression.
4579 //
4580 void RegexCompile::setEval(int32_t nextOp) {
4581 UnicodeSet *rightOperand = NULL;
4582 UnicodeSet *leftOperand = NULL;
4583 for (;;) {
4584 U_ASSERT(fSetOpStack.empty()==FALSE);
4585 int32_t pendingSetOperation = fSetOpStack.peeki();
4586 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4587 break;
4588 }
4589 fSetOpStack.popi();
4590 U_ASSERT(fSetStack.empty() == FALSE);
4591 rightOperand = (UnicodeSet *)fSetStack.peek();
4592 switch (pendingSetOperation) {
4593 case setNegation:
4594 rightOperand->complement();
4595 break;
4596 case setCaseClose:
4597 // TODO: need a simple close function. Ticket 6065
4598 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4599 rightOperand->removeAllStrings();
4600 break;
4601 case setDifference1:
4602 case setDifference2:
4603 fSetStack.pop();
4604 leftOperand = (UnicodeSet *)fSetStack.peek();
4605 leftOperand->removeAll(*rightOperand);
4606 delete rightOperand;
4607 break;
4608 case setIntersection1:
4609 case setIntersection2:
4610 fSetStack.pop();
4611 leftOperand = (UnicodeSet *)fSetStack.peek();
4612 leftOperand->retainAll(*rightOperand);
4613 delete rightOperand;
4614 break;
4615 case setUnion:
4616 fSetStack.pop();
4617 leftOperand = (UnicodeSet *)fSetStack.peek();
4618 leftOperand->addAll(*rightOperand);
4619 delete rightOperand;
4620 break;
4621 default:
4622 U_ASSERT(FALSE);
4623 break;
4624 }
4625 }
4626 }
4627
4628 void RegexCompile::setPushOp(int32_t op) {
4629 setEval(op);
4630 fSetOpStack.push(op, *fStatus);
4631 fSetStack.push(new UnicodeSet(), *fStatus);
4632 }
4633
4634 U_NAMESPACE_END
4635 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4636