]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/regexcmp.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / i18n / regexcmp.cpp
1
2 //
3 // file: regexcmp.cpp
4 //
5 // Copyright (C) 2002-2006 International Business Machines Corporation and others.
6 // All Rights Reserved.
7 //
8 // This file contains the ICU regular expression compiler, which is responsible
9 // for processing a regular expression pattern into the compiled form that
10 // is used by the match finding engine.
11 //
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
24 #include "util.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27 #include "uvectr32.h"
28 #include "uassert.h"
29 #include "ucln_in.h"
30 #include "mutex.h"
31
32 #include "regeximp.h"
33 #include "regexcst.h" // Contains state table for the regex pattern parser.
34 // generated by a Perl script.
35 #include "regexcmp.h"
36 #include "regexst.h"
37
38
39
40 U_NAMESPACE_BEGIN
41
42
43
44
45
46 //------------------------------------------------------------------------------
47 //
48 // Constructor.
49 //
50 //------------------------------------------------------------------------------
51 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
52 {
53 fStatus = &status;
54
55 fRXPat = rxp;
56 fScanIndex = 0;
57 fNextIndex = 0;
58 fPeekChar = -1;
59 fLineNum = 1;
60 fCharNum = 0;
61 fQuoteMode = FALSE;
62 fInBackslashQuote = FALSE;
63 fModeFlags = fRXPat->fFlags | 0x80000000;
64 fEOLComments = TRUE;
65
66 fMatchOpenParen = -1;
67 fMatchCloseParen = -1;
68 fStringOpStart = -1;
69
70 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
71 status = rxp->fDeferredStatus;
72 }
73 }
74
75
76
77 //------------------------------------------------------------------------------
78 //
79 // Destructor
80 //
81 //------------------------------------------------------------------------------
82 RegexCompile::~RegexCompile() {
83 }
84
85 //------------------------------------------------------------------------------
86 //
87 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
88 // The state tables are hand-written in the file regexcst.txt,
89 // and converted to the form used here by a perl
90 // script regexcst.pl
91 //
92 //------------------------------------------------------------------------------
93 void RegexCompile::compile(
94 const UnicodeString &pat, // Source pat to be compiled.
95 UParseError &pp, // Error position info
96 UErrorCode &e) // Error Code
97 {
98 fStatus = &e;
99 fParseErr = &pp;
100 fStackPtr = 0;
101 fStack[fStackPtr] = 0;
102
103 if (U_FAILURE(*fStatus)) {
104 return;
105 }
106
107 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
108 U_ASSERT(fRXPat->fPattern.length() == 0);
109
110 // Prepare the RegexPattern object to receive the compiled pattern.
111 // TODO: remove per-instance field, and just use globals directly. (But check perf)
112 fRXPat->fPattern = pat;
113 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
114 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
115
116
117 // Initialize the pattern scanning state machine
118 fPatternLength = pat.length();
119 uint16_t state = 1;
120 const RegexTableEl *tableEl;
121 nextChar(fC); // Fetch the first char from the pattern string.
122
123 //
124 // Main loop for the regex pattern parsing state machine.
125 // Runs once per state transition.
126 // Each time through optionally performs, depending on the state table,
127 // - an advance to the the next pattern char
128 // - an action to be performed.
129 // - pushing or popping a state to/from the local state return stack.
130 // file regexcst.txt is the source for the state table. The logic behind
131 // recongizing the pattern syntax is there, not here.
132 //
133 for (;;) {
134 // Bail out if anything has gone wrong.
135 // Regex pattern parsing stops on the first error encountered.
136 if (U_FAILURE(*fStatus)) {
137 break;
138 }
139
140 U_ASSERT(state != 0);
141
142 // Find the state table element that matches the input char from the pattern, or the
143 // class of the input character. Start with the first table row for this
144 // state, then linearly scan forward until we find a row that matches the
145 // character. The last row for each state always matches all characters, so
146 // the search will stop there, if not before.
147 //
148 tableEl = &gRuleParseStateTable[state];
149 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
150 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
151
152 for (;;) { // loop through table rows belonging to this state, looking for one
153 // that matches the current input char.
154 REGEX_SCAN_DEBUG_PRINTF(("."));
155 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
156 // Table row specified an individual character, not a set, and
157 // the input character is not quoted, and
158 // the input character matched it.
159 break;
160 }
161 if (tableEl->fCharClass == 255) {
162 // Table row specified default, match anything character class.
163 break;
164 }
165 if (tableEl->fCharClass == 254 && fC.fQuoted) {
166 // Table row specified "quoted" and the char was quoted.
167 break;
168 }
169 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
170 // Table row specified eof and we hit eof on the input.
171 break;
172 }
173
174 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
175 fC.fQuoted == FALSE && // char is not escaped &&
176 fC.fChar != (UChar32)-1) { // char is not EOF
177 UnicodeSet *uniset = RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128];
178 if (uniset->contains(fC.fChar)) {
179 // Table row specified a character class, or set of characters,
180 // and the current char matches it.
181 break;
182 }
183 }
184
185 // No match on this row, advance to the next row for this state,
186 tableEl++;
187 }
188 REGEX_SCAN_DEBUG_PRINTF(("\n"));
189
190 //
191 // We've found the row of the state table that matches the current input
192 // character from the rules string.
193 // Perform any action specified by this row in the state table.
194 if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
195 // Break out of the state machine loop if the
196 // the action signalled some kind of error, or
197 // the action was to exit, occurs on normal end-of-rules-input.
198 break;
199 }
200
201 if (tableEl->fPushState != 0) {
202 fStackPtr++;
203 if (fStackPtr >= kStackSize) {
204 error(U_REGEX_INTERNAL_ERROR);
205 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
206 fStackPtr--;
207 }
208 fStack[fStackPtr] = tableEl->fPushState;
209 }
210
211 //
212 // NextChar. This is where characters are actually fetched from the pattern.
213 // Happens under control of the 'n' tag in the state table.
214 //
215 if (tableEl->fNextChar) {
216 nextChar(fC);
217 }
218
219 // Get the next state from the table entry, or from the
220 // state stack if the next state was specified as "pop".
221 if (tableEl->fNextState != 255) {
222 state = tableEl->fNextState;
223 } else {
224 state = fStack[fStackPtr];
225 fStackPtr--;
226 if (fStackPtr < 0) {
227 // state stack underflow
228 // This will occur if the user pattern has mis-matched parentheses,
229 // with extra close parens.
230 //
231 fStackPtr++;
232 error(U_REGEX_MISMATCHED_PAREN);
233 }
234 }
235
236 }
237
238 //
239 // The pattern has now been read and processed, and the compiled code generated.
240 //
241
242 // Back-reference fixup
243 //
244 int32_t loc;
245 for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
246 int32_t op = fRXPat->fCompiledPat->elementAti(loc);
247 int32_t opType = URX_TYPE(op);
248 if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
249 int32_t where = URX_VAL(op);
250 if (where > fRXPat->fGroupMap->size()) {
251 error(U_REGEX_INVALID_BACK_REF);
252 break;
253 }
254 where = fRXPat->fGroupMap->elementAti(where-1);
255 op = URX_BUILD(opType, where);
256 fRXPat->fCompiledPat->setElementAt(op, loc);
257 }
258 }
259
260
261 //
262 // Compute the number of digits requried for the largest capture group number.
263 //
264 fRXPat->fMaxCaptureDigits = 1;
265 int32_t n = 10;
266 for (;;) {
267 if (n > fRXPat->fGroupMap->size()) {
268 break;
269 }
270 fRXPat->fMaxCaptureDigits++;
271 n *= 10;
272 }
273
274 //
275 // The pattern's fFrameSize so far has accumulated the requirements for
276 // storage for capture parentheses, counters, etc. that are encountered
277 // in the pattern. Add space for the two variables that are always
278 // present in the saved state: the input string position and the
279 // position in the compiled pattern.
280 //
281 fRXPat->fFrameSize+=2;
282
283 //
284 // Get bounds for the minimum and maximum length of a string that this
285 // pattern can match. Used to avoid looking for matches in strings that
286 // are too short.
287 //
288 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
289
290 //
291 // Optimization passes
292 //
293 matchStartType();
294 OptDotStar();
295 stripNOPs();
296
297 //
298 // Set up fast latin-1 range sets
299 //
300 int32_t numSets = fRXPat->fSets->size();
301 fRXPat->fSets8 = new Regex8BitSet[numSets];
302 int32_t i;
303 for (i=0; i<numSets; i++) {
304 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
305 fRXPat->fSets8[i].init(s);
306 }
307
308 }
309
310
311
312
313
314 //------------------------------------------------------------------------------
315 //
316 // doParseAction Do some action during regex pattern parsing.
317 // Called by the parse state machine.
318 //
319 // Generation of the match engine PCode happens here, or
320 // in functions called from the parse actions defined here.
321 //
322 //
323 //------------------------------------------------------------------------------
324 UBool RegexCompile::doParseActions(EParseAction action)
325 {
326 UBool returnVal = TRUE;
327
328 switch ((Regex_PatternParseAction)action) {
329
330 case doPatStart:
331 // Start of pattern compiles to:
332 //0 SAVE 2 Fall back to position of FAIL
333 //1 jmp 3
334 //2 FAIL Stop if we ever reach here.
335 //3 NOP Dummy, so start of pattern looks the same as
336 // the start of an ( grouping.
337 //4 NOP Resreved, will be replaced by a save if there are
338 // OR | operators at the top level
339 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
340 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
341 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
342
343 // Standard open nonCapture paren action emits the two NOPs and
344 // sets up the paren stack frame.
345 doParseActions((EParseAction)doOpenNonCaptureParen);
346 break;
347
348 case doPatFinish:
349 // We've scanned to the end of the pattern
350 // The end of pattern compiles to:
351 // URX_END
352 // which will stop the runtime match engine.
353 // Encountering end of pattern also behaves like a close paren,
354 // and forces fixups of the State Save at the beginning of the compiled pattern
355 // and of any OR operations at the top level.
356 //
357 handleCloseParen();
358 if (fParenStack.size() > 0) {
359 // Missing close paren in pattern.
360 error(U_REGEX_MISMATCHED_PAREN);
361 }
362
363 // add the END operation to the compiled pattern.
364 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
365
366 // Terminate the pattern compilation state machine.
367 returnVal = FALSE;
368 break;
369
370
371
372 case doOrOperator:
373 // Scanning a '|', as in (A|B)
374 {
375 // Insert a SAVE operation at the start of the pattern section preceding
376 // this OR at this level. This SAVE will branch the match forward
377 // to the right hand side of the OR in the event that the left hand
378 // side fails to match and backtracks. Locate the position for the
379 // save from the location on the top of the parentheses stack.
380 int32_t savePosition = fParenStack.popi();
381 int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
382 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
383 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
384 fRXPat->fCompiledPat->setElementAt(op, savePosition);
385
386 // Append an JMP operation into the compiled pattern. The operand for
387 // the JMP will eventually be the location following the ')' for the
388 // group. This will be patched in later, when the ')' is encountered.
389 op = URX_BUILD(URX_JMP, 0);
390 fRXPat->fCompiledPat->addElement(op, *fStatus);
391
392 // Push the position of the newly added JMP op onto the parentheses stack.
393 // This registers if for fixup when this block's close paren is encountered.
394 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
395
396 // Append a NOP to the compiled pattern. This is the slot reserved
397 // for a SAVE in the event that there is yet another '|' following
398 // this one.
399 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
400 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
401 }
402 break;
403
404
405 case doOpenCaptureParen:
406 // Open Paren.
407 // Compile to a
408 // - NOP, which later may be replaced by a save-state if the
409 // parenthesized group gets a * quantifier, followed by
410 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
411 // - NOP, which may later be replaced by a save-state if there
412 // is an '|' alternation within the parens.
413 //
414 // Each capture group gets three slots in the save stack frame:
415 // 0: Capture Group start position (in input string being matched.)
416 // 1: Capture Group end positino.
417 // 2: Start of Match-in-progress.
418 // The first two locations are for a completed capture group, and are
419 // referred to by back references and the like.
420 // The third location stores the capture start position when an START_CAPTURE is
421 // encountered. This will be promoted to a completed capture when (and if) the corresponding
422 // END_CAPure is encountered.
423 {
424 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
425 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
426 fRXPat->fFrameSize += 3;
427 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
428 fRXPat->fCompiledPat->addElement(cop, *fStatus);
429 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
430
431 // On the Parentheses stack, start a new frame and add the postions
432 // of the two NOPs. Depending on what follows in the pattern, the
433 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
434 // address of the end of the parenthesized group.
435 fParenStack.push(fModeFlags, *fStatus); // Match mode state
436 fParenStack.push(capturing, *fStatus); // Frame type.
437 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
438 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
439
440 // Save the mapping from group number to stack frame variable position.
441 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
442 }
443 break;
444
445 case doOpenNonCaptureParen:
446 // Open non-caputuring (grouping only) Paren.
447 // Compile to a
448 // - NOP, which later may be replaced by a save-state if the
449 // parenthesized group gets a * quantifier, followed by
450 // - NOP, which may later be replaced by a save-state if there
451 // is an '|' alternation within the parens.
452 {
453 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
454 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
455
456 // On the Parentheses stack, start a new frame and add the postions
457 // of the two NOPs.
458 fParenStack.push(fModeFlags, *fStatus); // Match mode state
459 fParenStack.push(plain, *fStatus); // Begin a new frame.
460 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
461 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
462 }
463 break;
464
465
466 case doOpenAtomicParen:
467 // Open Atomic Paren. (?>
468 // Compile to a
469 // - NOP, which later may be replaced if the parenthesized group
470 // has a quantifier, followed by
471 // - STO_SP save state stack position, so it can be restored at the ")"
472 // - NOP, which may later be replaced by a save-state if there
473 // is an '|' alternation within the parens.
474 {
475 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
476 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
477 fRXPat->fDataSize += 1; // state stack ptr.
478 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
479 fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
480 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
481
482 // On the Parentheses stack, start a new frame and add the postions
483 // of the two NOPs. Depending on what follows in the pattern, the
484 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
485 // address of the end of the parenthesized group.
486 fParenStack.push(fModeFlags, *fStatus); // Match mode state
487 fParenStack.push(atomic, *fStatus); // Frame type.
488 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
489 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
490 }
491 break;
492
493
494 case doOpenLookAhead:
495 // Positive Look-ahead (?= stuff )
496 // Compiles to
497 // 1 START_LA dataLoc
498 // 2. NOP reserved for use by quantifiers on the block.
499 // Look-ahead can't have quantifiers, but paren stack
500 // compile time conventions require the slot anyhow.
501 // 3. NOP may be replaced if there is are '|' ops in the block.
502 // 4. code for parenthesized stuff.
503 // 5. ENDLA
504 //
505 // Two data slots are reserved, for saving the stack ptr and the input position.
506 {
507 int32_t dataLoc = fRXPat->fDataSize;
508 fRXPat->fDataSize += 2;
509 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
510 fRXPat->fCompiledPat->addElement(op, *fStatus);
511
512 op = URX_BUILD(URX_NOP, 0);
513 fRXPat->fCompiledPat->addElement(op, *fStatus);
514 fRXPat->fCompiledPat->addElement(op, *fStatus);
515
516 // On the Parentheses stack, start a new frame and add the postions
517 // of the NOPs.
518 fParenStack.push(fModeFlags, *fStatus); // Match mode state
519 fParenStack.push(lookAhead, *fStatus); // Frame type.
520 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
521 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
522 }
523 break;
524
525 case doOpenLookAheadNeg:
526 // Negated Lookahead. (?! stuff )
527 // Compiles to
528 // 1. START_LA dataloc
529 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
530 // // which continues with the match.
531 // 3. NOP // Std. Open Paren sequence, for possible '|'
532 // 4. code for parenthesized stuff.
533 // 5. END_LA // Cut back stack, remove saved state from step 2.
534 // 6. FAIL // code in block succeeded, so neg. lookahead fails.
535 // 7. ...
536 {
537 int32_t dataLoc = fRXPat->fDataSize;
538 fRXPat->fDataSize += 2;
539 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
540 fRXPat->fCompiledPat->addElement(op, *fStatus);
541
542 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later.
543 fRXPat->fCompiledPat->addElement(op, *fStatus);
544
545 op = URX_BUILD(URX_NOP, 0);
546 fRXPat->fCompiledPat->addElement(op, *fStatus);
547
548 // On the Parentheses stack, start a new frame and add the postions
549 // of the StateSave and NOP.
550 fParenStack.push(fModeFlags, *fStatus); // Match mode state
551 fParenStack.push( negLookAhead, *fStatus); // Frame type
552 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
553 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
554
555 // Instructions #5 and #6 will be added when the ')' is encountered.
556 }
557 break;
558
559 case doOpenLookBehind:
560 {
561 // Compile a (?<= look-behind open paren.
562 //
563 // Compiles to
564 // 0 URX_LB_START dataLoc
565 // 1 URX_LB_CONT dataLoc
566 // 2 MinMatchLen
567 // 3 MaxMatchLen
568 // 4 URX_NOP Standard '(' boilerplate.
569 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
570 // 6 <code for LookBehind expression>
571 // 7 URX_LB_END dataLoc # Check match len, restore input len
572 // 8 URX_LA_END dataLoc # Restore stack, input pos
573 //
574 // Allocate a block of matcher data, to contain (when running a match)
575 // 0: Stack ptr on entry
576 // 1: Input Index on entry
577 // 2: Start index of match current match attempt.
578 // 3: Original Input String len.
579
580 // Allocate data space
581 int32_t dataLoc = fRXPat->fDataSize;
582 fRXPat->fDataSize += 4;
583
584 // Emit URX_LB_START
585 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
586 fRXPat->fCompiledPat->addElement(op, *fStatus);
587
588 // Emit URX_LB_CONT
589 op = URX_BUILD(URX_LB_CONT, dataLoc);
590 fRXPat->fCompiledPat->addElement(op, *fStatus);
591 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
592 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
593
594 // Emit the NOP
595 op = URX_BUILD(URX_NOP, 0);
596 fRXPat->fCompiledPat->addElement(op, *fStatus);
597 fRXPat->fCompiledPat->addElement(op, *fStatus);
598
599 // On the Parentheses stack, start a new frame and add the postions
600 // of the URX_LB_CONT and the NOP.
601 fParenStack.push(fModeFlags, *fStatus); // Match mode state
602 fParenStack.push(lookBehind, *fStatus); // Frame type
603 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
604 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
605
606 // The final two instructions will be added when the ')' is encountered.
607 }
608
609 break;
610
611 case doOpenLookBehindNeg:
612 {
613 // Compile a (?<! negated look-behind open paren.
614 //
615 // Compiles to
616 // 0 URX_LB_START dataLoc # Save entry stack, input len
617 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
618 // 2 MinMatchLen
619 // 3 MaxMatchLen
620 // 4 continueLoc (9)
621 // 5 URX_NOP Standard '(' boilerplate.
622 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
623 // 7 <code for LookBehind expression>
624 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
625 // 9 ...
626 //
627 // Allocate a block of matcher data, to contain (when running a match)
628 // 0: Stack ptr on entry
629 // 1: Input Index on entry
630 // 2: Start index of match current match attempt.
631 // 3: Original Input String len.
632
633 // Allocate data space
634 int32_t dataLoc = fRXPat->fDataSize;
635 fRXPat->fDataSize += 4;
636
637 // Emit URX_LB_START
638 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
639 fRXPat->fCompiledPat->addElement(op, *fStatus);
640
641 // Emit URX_LBN_CONT
642 op = URX_BUILD(URX_LBN_CONT, dataLoc);
643 fRXPat->fCompiledPat->addElement(op, *fStatus);
644 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
645 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
646 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
647
648 // Emit the NOP
649 op = URX_BUILD(URX_NOP, 0);
650 fRXPat->fCompiledPat->addElement(op, *fStatus);
651 fRXPat->fCompiledPat->addElement(op, *fStatus);
652
653 // On the Parentheses stack, start a new frame and add the postions
654 // of the URX_LB_CONT and the NOP.
655 fParenStack.push(fModeFlags, *fStatus); // Match mode state
656 fParenStack.push(lookBehindN, *fStatus); // Frame type
657 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
658 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
659
660 // The final two instructions will be added when the ')' is encountered.
661 }
662 break;
663
664 case doConditionalExpr:
665 // Conditionals such as (?(1)a:b)
666 case doPerlInline:
667 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
668 error(U_REGEX_UNIMPLEMENTED);
669 break;
670
671
672 case doCloseParen:
673 handleCloseParen();
674 if (fParenStack.size() <= 0) {
675 // Extra close paren, or missing open paren.
676 error(U_REGEX_MISMATCHED_PAREN);
677 }
678 break;
679
680 case doNOP:
681 break;
682
683
684 case doBadOpenParenType:
685 case doRuleError:
686 error(U_REGEX_RULE_SYNTAX);
687 break;
688
689
690 case doMismatchedParenErr:
691 error(U_REGEX_MISMATCHED_PAREN);
692 break;
693
694 case doPlus:
695 // Normal '+' compiles to
696 // 1. stuff to be repeated (already built)
697 // 2. jmp-sav 1
698 // 3. ...
699 //
700 // Or, if the item to be repeated can match a zero length string,
701 // 1. STO_INP_LOC data-loc
702 // 2. body of stuff to be repeated
703 // 3. JMP_SAV_X 2
704 // 4. ...
705
706 //
707 // Or, if the item to be repeated is simple
708 // 1. Item to be repeated.
709 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
710 // 3. LOOP_C stack location
711 {
712 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
713 int32_t frameLoc;
714
715 // Check for simple constructs, which may get special optimized code.
716 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
717 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
718
719 if (URX_TYPE(repeatedOp) == URX_SETREF) {
720 // Emit optimized code for [char set]+
721 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
722 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
723 frameLoc = fRXPat->fFrameSize;
724 fRXPat->fFrameSize++;
725 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
726 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
727 break;
728 }
729
730 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
731 URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
732 // Emit Optimized code for .+ operations.
733 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
734 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
735 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
736 loopOpI |= 1;
737 }
738 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
739 frameLoc = fRXPat->fFrameSize;
740 fRXPat->fFrameSize++;
741 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
742 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
743 break;
744 }
745
746 }
747
748 // General case.
749
750 // Check for minimum match length of zero, which requires
751 // extra loop-breaking code.
752 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
753 // Zero length match is possible.
754 // Emit the code sequence that can handle it.
755 insertOp(topLoc);
756 frameLoc = fRXPat->fFrameSize;
757 fRXPat->fFrameSize++;
758
759 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
760 fRXPat->fCompiledPat->setElementAt(op, topLoc);
761
762 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
763 fRXPat->fCompiledPat->addElement(op, *fStatus);
764 } else {
765 // Simpler code when the repeated body must match something non-empty
766 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
767 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
768 }
769 }
770 break;
771
772 case doNGPlus:
773 // Non-greedy '+?' compiles to
774 // 1. stuff to be repeated (already built)
775 // 2. state-save 1
776 // 3. ...
777 {
778 int32_t topLoc = blockTopLoc(FALSE);
779 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
780 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
781 }
782 break;
783
784
785 case doOpt:
786 // Normal (greedy) ? quantifier.
787 // Compiles to
788 // 1. state save 3
789 // 2. body of optional block
790 // 3. ...
791 // Insert the state save into the compiled pattern, and we're done.
792 {
793 int32_t saveStateLoc = blockTopLoc(TRUE);
794 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
795 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
796 }
797 break;
798
799 case doNGOpt:
800 // Non-greedy ?? quantifier
801 // compiles to
802 // 1. jmp 4
803 // 2. body of optional block
804 // 3 jmp 5
805 // 4. state save 2
806 // 5 ...
807 // This code is less than ideal, with two jmps instead of one, because we can only
808 // insert one instruction at the top of the block being iterated.
809 {
810 int32_t jmp1_loc = blockTopLoc(TRUE);
811 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
812
813 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1);
814 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
815
816 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
817 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
818
819 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
820 fRXPat->fCompiledPat->addElement(save_op, *fStatus);
821 }
822 break;
823
824
825 case doStar:
826 // Normal (greedy) * quantifier.
827 // Compiles to
828 // 1. STATE_SAVE 4
829 // 2. body of stuff being iterated over
830 // 3. JMP_SAV 2
831 // 4. ...
832 //
833 // Or, if the body is a simple [Set],
834 // 1. LOOP_SR_I set number
835 // 2. LOOP_C stack location
836 // ...
837 //
838 // Or if this is a .*
839 // 1. LOOP_DOT_I (. matches all mode flag)
840 // 2. LOOP_C stack location
841 //
842 // Or, if the body can match a zero-length string, to inhibit infinite loops,
843 // 1. STATE_SAVE 5
844 // 2. STO_INP_LOC data-loc
845 // 3. body of stuff
846 // 4. JMP_SAV_X 2
847 // 5. ...
848 {
849 // location of item #1, the STATE_SAVE
850 int32_t topLoc = blockTopLoc(FALSE);
851 int32_t dataLoc = -1;
852
853 // Check for simple *, where the construct being repeated
854 // compiled to single opcode, and might be optimizable.
855 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
856 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
857
858 if (URX_TYPE(repeatedOp) == URX_SETREF) {
859 // Emit optimized code for a [char set]*
860 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
861 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
862 dataLoc = fRXPat->fFrameSize;
863 fRXPat->fFrameSize++;
864 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
865 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
866 break;
867 }
868
869 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
870 URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
871 // Emit Optimized code for .* operations.
872 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
873 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
874 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
875 loopOpI |= 1;
876 }
877 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
878 dataLoc = fRXPat->fFrameSize;
879 fRXPat->fFrameSize++;
880 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
881 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
882 break;
883 }
884 }
885
886 // Emit general case code for this *
887 // The optimizations did not apply.
888
889 int32_t saveStateLoc = blockTopLoc(TRUE);
890 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
891
892 // Check for minimum match length of zero, which requires
893 // extra loop-breaking code.
894 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
895 insertOp(saveStateLoc);
896 dataLoc = fRXPat->fFrameSize;
897 fRXPat->fFrameSize++;
898
899 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
900 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
901 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
902 }
903
904 // Locate the position in the compiled pattern where the match will continue
905 // after completing the *. (4 or 5 in the comment above)
906 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
907
908 // Put together the save state op store it into the compiled code.
909 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
910 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
911
912 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
913 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
914 }
915 break;
916
917 case doNGStar:
918 // Non-greedy *? quantifier
919 // compiles to
920 // 1. JMP 3
921 // 2. body of stuff being iterated over
922 // 3. STATE_SAVE 2
923 // 4 ...
924 {
925 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
926 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
927 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
928 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
929 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
930 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
931 }
932 break;
933
934
935 case doIntervalInit:
936 // The '{' opening an interval quantifier was just scanned.
937 // Init the counter varaiables that will accumulate the values as the digits
938 // are scanned.
939 fIntervalLow = 0;
940 fIntervalUpper = -1;
941 break;
942
943 case doIntevalLowerDigit:
944 // Scanned a digit from the lower value of an {lower,upper} interval
945 {
946 int32_t digitValue = u_charDigitValue(fC.fChar);
947 U_ASSERT(digitValue >= 0);
948 fIntervalLow = fIntervalLow*10 + digitValue;
949 if (fIntervalLow < 0) {
950 error(U_REGEX_NUMBER_TOO_BIG);
951 }
952 }
953 break;
954
955 case doIntervalUpperDigit:
956 // Scanned a digit from the upper value of an {lower,upper} interval
957 {
958 if (fIntervalUpper < 0) {
959 fIntervalUpper = 0;
960 }
961 int32_t digitValue = u_charDigitValue(fC.fChar);
962 U_ASSERT(digitValue >= 0);
963 fIntervalUpper = fIntervalUpper*10 + digitValue;
964 if (fIntervalUpper < 0) {
965 error(U_REGEX_NUMBER_TOO_BIG);
966 }
967 }
968 break;
969
970 case doIntervalSame:
971 // Scanned a single value interval like {27}. Upper = Lower.
972 fIntervalUpper = fIntervalLow;
973 break;
974
975 case doInterval:
976 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
977 if (compileInlineInterval() == FALSE) {
978 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
979 }
980 break;
981
982 case doPossessiveInterval:
983 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
984 {
985 // Remember the loc for the top of the block being looped over.
986 // (Can not reserve a slot in the compiled pattern at this time, becuase
987 // compileInterval needs to reserve also, and blockTopLoc can only reserve
988 // once per block.)
989 int32_t topLoc = blockTopLoc(FALSE);
990
991 // Produce normal looping code.
992 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
993
994 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
995 // just as if the loop was inclosed in atomic parentheses.
996
997 // First the STO_SP before the start of the loop
998 insertOp(topLoc);
999 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
1000 fRXPat->fDataSize += 1; // state stack ptr.
1001 int32_t op = URX_BUILD(URX_STO_SP, varLoc);
1002 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1003
1004 int32_t loopOp = fRXPat->fCompiledPat->popi();
1005 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1006 loopOp++; // point LoopOp after the just-inserted STO_SP
1007 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1008
1009 // Then the LD_SP after the end of the loop
1010 op = URX_BUILD(URX_LD_SP, varLoc);
1011 fRXPat->fCompiledPat->addElement(op, *fStatus);
1012 }
1013
1014 break;
1015
1016 case doNGInterval:
1017 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1018 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1019 break;
1020
1021 case doIntervalError:
1022 error(U_REGEX_BAD_INTERVAL);
1023 break;
1024
1025 case doLiteralChar:
1026 // We've just scanned a "normal" character from the pattern,
1027 literalChar(fC.fChar);
1028 break;
1029
1030
1031
1032 case doDotAny:
1033 // scanned a ".", match any single character.
1034 {
1035 int32_t op;
1036 if (fModeFlags & UREGEX_DOTALL) {
1037 op = URX_BUILD(URX_DOTANY_ALL, 0);
1038 } else {
1039 op = URX_BUILD(URX_DOTANY, 0);
1040 }
1041 fRXPat->fCompiledPat->addElement(op, *fStatus);
1042 }
1043 break;
1044
1045 case doCaret:
1046 {
1047 int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
1048 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1049 }
1050 break;
1051
1052
1053 case doDollar:
1054 {
1055 int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
1056 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1057 }
1058 break;
1059
1060 case doBackslashA:
1061 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
1062 break;
1063
1064 case doBackslashB:
1065 {
1066 #if UCONFIG_NO_BREAK_ITERATION==1
1067 if (fModeFlags & UREGEX_UWORD) {
1068 error(U_UNSUPPORTED_ERROR);
1069 }
1070 #endif
1071 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1072 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
1073 }
1074 break;
1075
1076 case doBackslashb:
1077 {
1078 #if UCONFIG_NO_BREAK_ITERATION==1
1079 if (fModeFlags & UREGEX_UWORD) {
1080 error(U_UNSUPPORTED_ERROR);
1081 }
1082 #endif
1083 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1084 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1085 }
1086 break;
1087
1088 case doBackslashD:
1089 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
1090 break;
1091
1092 case doBackslashd:
1093 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
1094 break;
1095
1096 case doBackslashG:
1097 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
1098 break;
1099
1100 case doBackslashS:
1101 fRXPat->fCompiledPat->addElement(
1102 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
1103 break;
1104
1105 case doBackslashs:
1106 fRXPat->fCompiledPat->addElement(
1107 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
1108 break;
1109
1110 case doBackslashW:
1111 fRXPat->fCompiledPat->addElement(
1112 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
1113 break;
1114
1115 case doBackslashw:
1116 fRXPat->fCompiledPat->addElement(
1117 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
1118 break;
1119
1120 case doBackslashX:
1121 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
1122 break;
1123
1124
1125 case doBackslashZ:
1126 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
1127 break;
1128
1129 case doBackslashz:
1130 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
1131 break;
1132
1133 case doEscapeError:
1134 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1135 break;
1136
1137 case doExit:
1138 returnVal = FALSE;
1139 break;
1140
1141 case doProperty:
1142 {
1143 UnicodeSet *theSet = scanProp();
1144 compileSet(theSet);
1145 }
1146 break;
1147
1148
1149 case doScanUnicodeSet:
1150 {
1151 UnicodeSet *theSet = scanSet();
1152 compileSet(theSet);
1153 }
1154 break;
1155
1156 case doEnterQuoteMode:
1157 // Just scanned a \Q. Put character scanner into quote mode.
1158 fQuoteMode = TRUE;
1159 break;
1160
1161 case doBackRef:
1162 // BackReference. Somewhat unusual in that the front-end can not completely parse
1163 // the regular expression, because the number of digits to be consumed
1164 // depends on the number of capture groups that have been defined. So
1165 // we have to do it here instead.
1166 {
1167 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1168 int32_t groupNum = 0;
1169 UChar32 c = fC.fChar;
1170
1171 for (;;) {
1172 // Loop once per digit, for max allowed number of digits in a back reference.
1173 int32_t digit = u_charDigitValue(c);
1174 groupNum = groupNum * 10 + digit;
1175 if (groupNum >= numCaptureGroups) {
1176 break;
1177 }
1178 c = peekCharLL();
1179 if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) {
1180 break;
1181 }
1182 nextCharLL();
1183 }
1184
1185 // Scan of the back reference in the source regexp is complete. Now generate
1186 // the compiled code for it.
1187 // Because capture groups can be forward-referenced by back-references,
1188 // we fill the operand with the capture group number. At the end
1189 // of compilation, it will be changed to the variable's location.
1190 U_ASSERT(groupNum > 0);
1191 int32_t op;
1192 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1193 op = URX_BUILD(URX_BACKREF_I, groupNum);
1194 } else {
1195 op = URX_BUILD(URX_BACKREF, groupNum);
1196 }
1197 fRXPat->fCompiledPat->addElement(op, *fStatus);
1198 }
1199 break;
1200
1201
1202 case doPossessivePlus:
1203 // Possessive ++ quantifier.
1204 // Compiles to
1205 // 1. STO_SP
1206 // 2. body of stuff being iterated over
1207 // 3. STATE_SAVE 5
1208 // 4. JMP 2
1209 // 5. LD_SP
1210 // 6. ...
1211 //
1212 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1213 // then unconditionally discarded. Perhaps introduce a new opcode
1214 //
1215 {
1216 // Emit the STO_SP
1217 int32_t topLoc = blockTopLoc(TRUE);
1218 int32_t stoLoc = fRXPat->fDataSize;
1219 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1220 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1221 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1222
1223 // Emit the STATE_SAVE
1224 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1225 fRXPat->fCompiledPat->addElement(op, *fStatus);
1226
1227 // Emit the JMP
1228 op = URX_BUILD(URX_JMP, topLoc+1);
1229 fRXPat->fCompiledPat->addElement(op, *fStatus);
1230
1231 // Emit the LD_SP
1232 op = URX_BUILD(URX_LD_SP, stoLoc);
1233 fRXPat->fCompiledPat->addElement(op, *fStatus);
1234 }
1235 break;
1236
1237 case doPossessiveStar:
1238 // Possessive *+ quantifier.
1239 // Compiles to
1240 // 1. STO_SP loc
1241 // 2. STATE_SAVE 5
1242 // 3. body of stuff being iterated over
1243 // 4. JMP 2
1244 // 5. LD_SP loc
1245 // 6 ...
1246 // TODO: do something to cut back the state stack each time through the loop.
1247 {
1248 // Reserve two slots at the top of the block.
1249 int32_t topLoc = blockTopLoc(TRUE);
1250 insertOp(topLoc);
1251
1252 // emit STO_SP loc
1253 int32_t stoLoc = fRXPat->fDataSize;
1254 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1255 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1256 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1257
1258 // Emit the SAVE_STATE 5
1259 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1260 op = URX_BUILD(URX_STATE_SAVE, L7);
1261 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1262
1263 // Append the JMP operation.
1264 op = URX_BUILD(URX_JMP, topLoc+1);
1265 fRXPat->fCompiledPat->addElement(op, *fStatus);
1266
1267 // Emit the LD_SP loc
1268 op = URX_BUILD(URX_LD_SP, stoLoc);
1269 fRXPat->fCompiledPat->addElement(op, *fStatus);
1270 }
1271 break;
1272
1273 case doPossessiveOpt:
1274 // Possessive ?+ quantifier.
1275 // Compiles to
1276 // 1. STO_SP loc
1277 // 2. SAVE_STATE 5
1278 // 3. body of optional block
1279 // 4. LD_SP loc
1280 // 5. ...
1281 //
1282 {
1283 // Reserve two slots at the top of the block.
1284 int32_t topLoc = blockTopLoc(TRUE);
1285 insertOp(topLoc);
1286
1287 // Emit the STO_SP
1288 int32_t stoLoc = fRXPat->fDataSize;
1289 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1290 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1291 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1292
1293 // Emit the SAVE_STATE
1294 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1295 op = URX_BUILD(URX_STATE_SAVE, continueLoc);
1296 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1297
1298 // Emit the LD_SP
1299 op = URX_BUILD(URX_LD_SP, stoLoc);
1300 fRXPat->fCompiledPat->addElement(op, *fStatus);
1301 }
1302 break;
1303
1304
1305 case doBeginMatchMode:
1306 fNewModeFlags = fModeFlags;
1307 fSetModeFlag = TRUE;
1308 break;
1309
1310 case doMatchMode: // (?i) and similar
1311 {
1312 int32_t bit = 0;
1313 switch (fC.fChar) {
1314 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1315 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1316 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1317 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1318 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1319 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1320 default:
1321 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
1322 // by the scanner.
1323 }
1324 if (fSetModeFlag) {
1325 fNewModeFlags |= bit;
1326 } else {
1327 fNewModeFlags &= ~bit;
1328 }
1329 }
1330 break;
1331
1332 case doSetMatchMode:
1333 // We've got a (?i) or similar. The match mode is being changed, but
1334 // the change is not scoped to a parenthesized block.
1335 U_ASSERT(fNewModeFlags < 0);
1336 fModeFlags = fNewModeFlags;
1337
1338 // Prevent any string from spanning across the change of match mode.
1339 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
1340 fixLiterals();
1341 break;
1342
1343
1344 case doMatchModeParen:
1345 // We've got a (?i: or similar. Begin a parenthesized block, save old
1346 // mode flags so they can be restored at the close of the block.
1347 //
1348 // Compile to a
1349 // - NOP, which later may be replaced by a save-state if the
1350 // parenthesized group gets a * quantifier, followed by
1351 // - NOP, which may later be replaced by a save-state if there
1352 // is an '|' alternation within the parens.
1353 {
1354 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1355 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1356
1357 // On the Parentheses stack, start a new frame and add the postions
1358 // of the two NOPs (a normal non-capturing () frame, except for the
1359 // saving of the orignal mode flags.)
1360 fParenStack.push(fModeFlags, *fStatus);
1361 fParenStack.push(flags, *fStatus); // Frame Marker
1362 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1363 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1364
1365 // Set the current mode flags to the new values.
1366 U_ASSERT(fNewModeFlags < 0);
1367 fModeFlags = fNewModeFlags;
1368 }
1369 break;
1370
1371 case doBadModeFlag:
1372 error(U_REGEX_INVALID_FLAG);
1373 break;
1374
1375 case doSuppressComments:
1376 // We have just scanned a '(?'. We now need to prevent the character scanner from
1377 // treating a '#' as a to-the-end-of-line comment.
1378 // (This Perl compatibility just gets uglier and uglier to do...)
1379 fEOLComments = FALSE;
1380 break;
1381
1382
1383
1384 default:
1385 U_ASSERT(FALSE);
1386 error(U_REGEX_INTERNAL_ERROR);
1387 break;
1388 }
1389
1390 if (U_FAILURE(*fStatus)) {
1391 returnVal = FALSE;
1392 }
1393
1394 return returnVal;
1395 }
1396
1397
1398
1399 //------------------------------------------------------------------------------
1400 //
1401 // literalChar We've encountered a literal character from the pattern,
1402 // or an escape sequence that reduces to a character.
1403 // Add it to the string containing all literal chars/strings from
1404 // the pattern.
1405 // If we are in a pattern string already, add the new char to it.
1406 // If we aren't in a pattern string, begin one now.
1407 //
1408 //------------------------------------------------------------------------------
1409 void RegexCompile::literalChar(UChar32 c) {
1410 int32_t op; // An operation in the compiled pattern.
1411 int32_t opType;
1412 int32_t patternLoc; // A position in the compiled pattern.
1413 int32_t stringLen;
1414
1415
1416 // If the last thing compiled into the pattern was not a literal char,
1417 // force this new literal char to begin a new string, and not append to the previous.
1418 op = fRXPat->fCompiledPat->lastElementi();
1419 opType = URX_TYPE(op);
1420 if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) {
1421 fixLiterals();
1422 }
1423
1424 if (fStringOpStart == -1) {
1425 // First char of a string in the pattern.
1426 // Emit a OneChar op into the compiled pattern.
1427 emitONE_CHAR(c);
1428
1429 // Also add it to the string pool, in case we get a second adjacent literal
1430 // and want to change form ONE_CHAR to STRING
1431 fStringOpStart = fRXPat->fLiteralText.length();
1432 fRXPat->fLiteralText.append(c);
1433 return;
1434 }
1435
1436 // We are adding onto an existing string
1437 fRXPat->fLiteralText.append(c);
1438
1439 op = fRXPat->fCompiledPat->lastElementi();
1440 opType = URX_TYPE(op);
1441 U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
1442
1443 // If the most recently emitted op is a URX_ONECHAR,
1444 if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
1445 if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) {
1446 // The most recently emitted op is a ONECHAR that was the first half
1447 // of a surrogate pair. Update the ONECHAR's operand to be the
1448 // supplementary code point resulting from both halves of the pair.
1449 c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c);
1450 op = URX_BUILD(opType, c);
1451 patternLoc = fRXPat->fCompiledPat->size() - 1;
1452 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1453 return;
1454 }
1455
1456 // The most recently emitted op is a ONECHAR.
1457 // We've now received another adjacent char. Change the ONECHAR op
1458 // to a string op.
1459 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1460 op = URX_BUILD(URX_STRING_I, fStringOpStart);
1461 } else {
1462 op = URX_BUILD(URX_STRING, fStringOpStart);
1463 }
1464 patternLoc = fRXPat->fCompiledPat->size() - 1;
1465 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1466 op = URX_BUILD(URX_STRING_LEN, 0);
1467 fRXPat->fCompiledPat->addElement(op, *fStatus);
1468 }
1469
1470 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the
1471 // string length to reflect the new char we just added to the string.
1472 stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
1473 op = URX_BUILD(URX_STRING_LEN, stringLen);
1474 patternLoc = fRXPat->fCompiledPat->size() - 1;
1475 fRXPat->fCompiledPat->setElementAt(op, patternLoc);
1476 }
1477
1478
1479
1480 //------------------------------------------------------------------------------
1481 //
1482 // emitONE_CHAR emit a ONE_CHAR op into the generated code.
1483 // Choose cased or uncased version, depending on the
1484 // match mode and whether the character itself is cased.
1485 //
1486 //------------------------------------------------------------------------------
1487 void RegexCompile::emitONE_CHAR(UChar32 c) {
1488 int32_t op;
1489 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1490 u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
1491 // We have a cased character, and are in case insensitive matching mode.
1492 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1493 op = URX_BUILD(URX_ONECHAR_I, c);
1494 } else {
1495 // Uncased char, or case sensitive match mode.
1496 // Either way, just generate a literal compare of the char.
1497 op = URX_BUILD(URX_ONECHAR, c);
1498 }
1499 fRXPat->fCompiledPat->addElement(op, *fStatus);
1500 }
1501
1502
1503 //------------------------------------------------------------------------------
1504 //
1505 // fixLiterals When compiling something that can follow a literal
1506 // string in a pattern, we need to "fix" any preceding
1507 // string, which will cause any subsequent literals to
1508 // begin a new string, rather than appending to the
1509 // old one.
1510 //
1511 // Optionally, split the last char of the string off into
1512 // a single "ONE_CHAR" operation, so that quantifiers can
1513 // apply to that char alone. Example: abc*
1514 // The * must apply to the 'c' only.
1515 //
1516 //------------------------------------------------------------------------------
1517 void RegexCompile::fixLiterals(UBool split) {
1518 int32_t stringStart = fStringOpStart; // start index of the current literal string
1519 int32_t op; // An op from/for the compiled pattern.
1520 int32_t opType; // An opcode type from the compiled pattern.
1521 int32_t stringLastCharIdx;
1522 UChar32 lastChar;
1523 int32_t stringNextToLastCharIdx;
1524 UChar32 nextToLastChar;
1525 int32_t stringLen;
1526
1527 fStringOpStart = -1;
1528 if (!split) {
1529 return;
1530 }
1531
1532 // Split: We need to ensure that the last item in the compiled pattern does
1533 // not refer to a literal string of more than one char. If it does,
1534 // separate the last char from the rest of the string.
1535
1536 // If the last operation from the compiled pattern is not a string,
1537 // nothing needs to be done
1538 op = fRXPat->fCompiledPat->lastElementi();
1539 opType = URX_TYPE(op);
1540 if (opType != URX_STRING_LEN) {
1541 return;
1542 }
1543 stringLen = URX_VAL(op);
1544
1545 //
1546 // Find the position of the last code point in the string (might be a surrogate pair)
1547 //
1548 stringLastCharIdx = fRXPat->fLiteralText.length();
1549 stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
1550 lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx);
1551
1552 // The string should always be at least two code points long, meaning that there
1553 // should be something before the last char position that we just found.
1554 U_ASSERT(stringLastCharIdx > stringStart);
1555 stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
1556 U_ASSERT(stringNextToLastCharIdx >= stringStart);
1557 nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);
1558
1559 if (stringNextToLastCharIdx > stringStart) {
1560 // The length of string remaining after removing one char is two or more.
1561 // Leave the string in the compiled pattern, shorten it by one char,
1562 // and append a URX_ONECHAR op for the last char.
1563 stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
1564 op = URX_BUILD(URX_STRING_LEN, stringLen);
1565 fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
1566 emitONE_CHAR(lastChar);
1567 } else {
1568 // The original string consisted of exactly two characters. Replace
1569 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
1570 // of URX_ONECHARs.
1571 fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
1572 emitONE_CHAR(nextToLastChar);
1573 emitONE_CHAR(lastChar);
1574 }
1575 }
1576
1577
1578
1579
1580
1581
1582 //------------------------------------------------------------------------------
1583 //
1584 // insertOp() Insert a slot for a new opcode into the already
1585 // compiled pattern code.
1586 //
1587 // Fill the slot with a NOP. Our caller will replace it
1588 // with what they really wanted.
1589 //
1590 //------------------------------------------------------------------------------
1591 void RegexCompile::insertOp(int32_t where) {
1592 UVector32 *code = fRXPat->fCompiledPat;
1593 U_ASSERT(where>0 && where < code->size());
1594
1595 int32_t nop = URX_BUILD(URX_NOP, 0);
1596 code->insertElementAt(nop, where, *fStatus);
1597
1598 // Walk through the pattern, looking for any ops with targets that
1599 // were moved down by the insert. Fix them.
1600 int32_t loc;
1601 for (loc=0; loc<code->size(); loc++) {
1602 int32_t op = code->elementAti(loc);
1603 int32_t opType = URX_TYPE(op);
1604 int32_t opValue = URX_VAL(op);
1605 if ((opType == URX_JMP ||
1606 opType == URX_JMPX ||
1607 opType == URX_STATE_SAVE ||
1608 opType == URX_CTR_LOOP ||
1609 opType == URX_CTR_LOOP_NG ||
1610 opType == URX_JMP_SAV ||
1611 opType == URX_RELOC_OPRND) && opValue > where) {
1612 // Target location for this opcode is after the insertion point and
1613 // needs to be incremented to adjust for the insertion.
1614 opValue++;
1615 op = URX_BUILD(opType, opValue);
1616 code->setElementAt(op, loc);
1617 }
1618 }
1619
1620 // Now fix up the parentheses stack. All positive values in it are locations in
1621 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
1622 for (loc=0; loc<fParenStack.size(); loc++) {
1623 int32_t x = fParenStack.elementAti(loc);
1624 U_ASSERT(x < code->size());
1625 if (x>where) {
1626 x++;
1627 fParenStack.setElementAt(x, loc);
1628 }
1629 }
1630
1631 if (fMatchCloseParen > where) {
1632 fMatchCloseParen++;
1633 }
1634 if (fMatchOpenParen > where) {
1635 fMatchOpenParen++;
1636 }
1637 }
1638
1639
1640
1641 //------------------------------------------------------------------------------
1642 //
1643 // blockTopLoc() Find or create a location in the compiled pattern
1644 // at the start of the operation or block that has
1645 // just been compiled. Needed when a quantifier (* or
1646 // whatever) appears, and we need to add an operation
1647 // at the start of the thing being quantified.
1648 //
1649 // (Parenthesized Blocks) have a slot with a NOP that
1650 // is reserved for this purpose. .* or similar don't
1651 // and a slot needs to be added.
1652 //
1653 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
1654 // at the returned location.
1655 // FALSE - just return the address,
1656 // do not reserve a location there.
1657 //
1658 //------------------------------------------------------------------------------
1659 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
1660 int32_t theLoc;
1661 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
1662 {
1663 // The item just processed is a parenthesized block.
1664 theLoc = fMatchOpenParen; // A slot is already reserved for us.
1665 U_ASSERT(theLoc > 0);
1666 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
1667 }
1668 else {
1669 // Item just compiled is a single thing, a ".", or a single char, or a set reference.
1670 // No slot for STATE_SAVE was pre-reserved in the compiled code.
1671 // We need to make space now.
1672 fixLiterals(TRUE); // If last item was a string, separate the last char.
1673 theLoc = fRXPat->fCompiledPat->size()-1;
1674 if (reserveLoc) {
1675 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/
1676 int32_t nop = URX_BUILD(URX_NOP, 0);
1677 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
1678 }
1679 }
1680 return theLoc;
1681 }
1682
1683
1684
1685 //------------------------------------------------------------------------------
1686 //
1687 // handleCloseParen When compiling a close paren, we need to go back
1688 // and fix up any JMP or SAVE operations within the
1689 // parenthesized block that need to target the end
1690 // of the block. The locations of these are kept on
1691 // the paretheses stack.
1692 //
1693 // This function is called both when encountering a
1694 // real ) and at the end of the pattern.
1695 //
1696 //------------------------------------------------------------------------------
1697 void RegexCompile::handleCloseParen() {
1698 int32_t patIdx;
1699 int32_t patOp;
1700 if (fParenStack.size() <= 0) {
1701 error(U_REGEX_MISMATCHED_PAREN);
1702 return;
1703 }
1704
1705 // Force any literal chars that may follow the close paren to start a new string,
1706 // and not attach to any preceding it.
1707 fixLiterals(FALSE);
1708
1709 // Fixup any operations within the just-closed parenthesized group
1710 // that need to reference the end of the (block).
1711 // (The first one popped from the stack is an unused slot for
1712 // alternation (OR) state save, but applying the fixup to it does no harm.)
1713 for (;;) {
1714 patIdx = fParenStack.popi();
1715 if (patIdx < 0) {
1716 // value < 0 flags the start of the frame on the paren stack.
1717 break;
1718 }
1719 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
1720 patOp = fRXPat->fCompiledPat->elementAti(patIdx);
1721 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
1722 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
1723 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
1724 fMatchOpenParen = patIdx;
1725 }
1726
1727 // At the close of any parenthesized block, restore the match mode flags to
1728 // the value they had at the open paren. Saved value is
1729 // at the top of the paren stack.
1730 fModeFlags = fParenStack.popi();
1731 U_ASSERT(fModeFlags < 0);
1732
1733 // DO any additional fixups, depending on the specific kind of
1734 // parentesized grouping this is
1735
1736 switch (patIdx) {
1737 case plain:
1738 case flags:
1739 // No additional fixups required.
1740 // (Grouping-only parentheses)
1741 break;
1742 case capturing:
1743 // Capturing Parentheses.
1744 // Insert a End Capture op into the pattern.
1745 // The frame offset of the variables for this cg is obtained from the
1746 // start capture op and put it into the end-capture op.
1747 {
1748 int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
1749 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
1750
1751 int32_t frameVarLocation = URX_VAL(captureOp);
1752 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
1753 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
1754 }
1755 break;
1756 case atomic:
1757 // Atomic Parenthesis.
1758 // Insert a LD_SP operation to restore the state stack to the position
1759 // it was when the atomic parens were entered.
1760 {
1761 int32_t stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
1762 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
1763 int32_t stoLoc = URX_VAL(stoOp);
1764 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
1765 fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
1766 }
1767 break;
1768
1769 case lookAhead:
1770 {
1771 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
1772 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
1773 int32_t dataLoc = URX_VAL(startOp);
1774 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
1775 fRXPat->fCompiledPat->addElement(op, *fStatus);
1776 }
1777 break;
1778
1779 case negLookAhead:
1780 {
1781 // See comment at doOpenLookAheadNeg
1782 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
1783 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
1784 int32_t dataLoc = URX_VAL(startOp);
1785 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
1786 fRXPat->fCompiledPat->addElement(op, *fStatus);
1787 op = URX_BUILD(URX_FAIL, 0);
1788 fRXPat->fCompiledPat->addElement(op, *fStatus);
1789
1790 // Patch the URX_SAVE near the top of the block.
1791 int32_t saveOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
1792 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
1793 int32_t dest = fRXPat->fCompiledPat->size();
1794 saveOp = URX_BUILD(URX_STATE_SAVE, dest);
1795 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
1796 }
1797 break;
1798
1799 case lookBehind:
1800 {
1801 // See comment at doOpenLookBehind.
1802
1803 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
1804 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
1805 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
1806 int32_t dataLoc = URX_VAL(startOp);
1807 int32_t op = URX_BUILD(URX_LB_END, dataLoc);
1808 fRXPat->fCompiledPat->addElement(op, *fStatus);
1809 op = URX_BUILD(URX_LA_END, dataLoc);
1810 fRXPat->fCompiledPat->addElement(op, *fStatus);
1811
1812 // Determine the min and max bounds for the length of the
1813 // string that the pattern can match.
1814 // An unbounded upper limit is an error.
1815 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
1816 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
1817 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
1818 if (maxML == INT32_MAX) {
1819 error(U_REGEX_LOOK_BEHIND_LIMIT);
1820 break;
1821 }
1822 U_ASSERT(minML <= maxML);
1823
1824 // Insert the min and max match len bounds into the URX_LB_CONT op that
1825 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1826 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
1827 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
1828
1829 }
1830 break;
1831
1832
1833
1834 case lookBehindN:
1835 {
1836 // See comment at doOpenLookBehindNeg.
1837
1838 // Append the URX_LBN_END to the compiled pattern.
1839 int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
1840 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
1841 int32_t dataLoc = URX_VAL(startOp);
1842 int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
1843 fRXPat->fCompiledPat->addElement(op, *fStatus);
1844
1845 // Determine the min and max bounds for the length of the
1846 // string that the pattern can match.
1847 // An unbounded upper limit is an error.
1848 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
1849 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
1850 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
1851 if (maxML == INT32_MAX) {
1852 error(U_REGEX_LOOK_BEHIND_LIMIT);
1853 break;
1854 }
1855 U_ASSERT(minML <= maxML);
1856
1857 // Insert the min and max match len bounds into the URX_LB_CONT op that
1858 // appears at the top of the look-behind block, at location fMatchOpenParen+1
1859 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
1860 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
1861
1862 // Insert the pattern location to continue at after a successful match
1863 // as the last operand of the URX_LBN_CONT
1864 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
1865 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
1866 }
1867 break;
1868
1869
1870
1871 default:
1872 U_ASSERT(FALSE);
1873 }
1874
1875 // remember the next location in the compiled pattern.
1876 // The compilation of Quantifiers will look at this to see whether its looping
1877 // over a parenthesized block or a single item
1878 fMatchCloseParen = fRXPat->fCompiledPat->size();
1879 }
1880
1881
1882
1883 //------------------------------------------------------------------------------
1884 //
1885 // compileSet Compile the pattern operations for a reference to a
1886 // UnicodeSet.
1887 //
1888 //------------------------------------------------------------------------------
1889 void RegexCompile::compileSet(UnicodeSet *theSet)
1890 {
1891 if (theSet == NULL) {
1892 return;
1893 }
1894 int32_t setSize = theSet->size();
1895 UChar32 firstSetChar = theSet->charAt(0);
1896 if (firstSetChar == -1) {
1897 // Sets that contain only strings, but no individual chars,
1898 // will end up here.
1899 error(U_REGEX_SET_CONTAINS_STRING);
1900 setSize = 0;
1901 }
1902
1903 switch (setSize) {
1904 case 0:
1905 {
1906 // Set of no elements. Always fails to match.
1907 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
1908 delete theSet;
1909 }
1910 break;
1911
1912 case 1:
1913 {
1914 // The set contains only a single code point. Put it into
1915 // the compiled pattern as a single char operation rather
1916 // than a set, and discard the set itself.
1917 literalChar(firstSetChar);
1918 delete theSet;
1919 }
1920 break;
1921
1922 default:
1923 {
1924 // The set contains two or more chars. (the normal case)
1925 // Put it into the compiled pattern as a set.
1926 int32_t setNumber = fRXPat->fSets->size();
1927 fRXPat->fSets->addElement(theSet, *fStatus);
1928 int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
1929 fRXPat->fCompiledPat->addElement(setOp, *fStatus);
1930 }
1931 }
1932 }
1933
1934
1935 //------------------------------------------------------------------------------
1936 //
1937 // compileInterval Generate the code for a {min, max} style interval quantifier.
1938 // Except for the specific opcodes used, the code is the same
1939 // for all three types (greedy, non-greedy, possessive) of
1940 // intervals. The opcodes are supplied as parameters.
1941 //
1942 // The code for interval loops has this form:
1943 // 0 CTR_INIT counter loc (in stack frame)
1944 // 1 5 patt address of CTR_LOOP at bottom of block
1945 // 2 min count
1946 // 3 max count (-1 for unbounded)
1947 // 4 ... block to be iterated over
1948 // 5 CTR_LOOP
1949 //
1950 // In
1951 //------------------------------------------------------------------------------
1952 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
1953 {
1954 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
1955 // four slots in the compiled code. Reserve them.
1956 int32_t topOfBlock = blockTopLoc(TRUE);
1957 insertOp(topOfBlock);
1958 insertOp(topOfBlock);
1959 insertOp(topOfBlock);
1960
1961 // The operands for the CTR_INIT opcode include the index in the matcher data
1962 // of the counter. Allocate it now.
1963 int32_t counterLoc = fRXPat->fFrameSize;
1964 fRXPat->fFrameSize++;
1965
1966 int32_t op = URX_BUILD(InitOp, counterLoc);
1967 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
1968
1969 // The second operand of CTR_INIT is the location following the end of the loop.
1970 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
1971 // compilation of something later on causes the code to grow and the target
1972 // position to move.
1973 int32_t loopEnd = fRXPat->fCompiledPat->size();
1974 op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
1975 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
1976
1977 // Followed by the min and max counts.
1978 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
1979 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
1980
1981 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
1982 // Goes at end of the block being looped over, so just append to the code so far.
1983 op = URX_BUILD(LoopOp, topOfBlock);
1984 fRXPat->fCompiledPat->addElement(op, *fStatus);
1985
1986 if ((fIntervalLow & 0xff000000) != 0 ||
1987 fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0) {
1988 error(U_REGEX_NUMBER_TOO_BIG);
1989 }
1990
1991 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
1992 error(U_REGEX_MAX_LT_MIN);
1993 }
1994 }
1995
1996
1997
1998 UBool RegexCompile::compileInlineInterval() {
1999 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2000 // Too big to inline. Fail, which will cause looping code to be generated.
2001 // (Upper < Lower picks up unbounded upper and errors, both.)
2002 return FALSE;
2003 }
2004
2005 int32_t topOfBlock = blockTopLoc(FALSE);
2006 if (fIntervalUpper == 0) {
2007 // Pathological case. Attempt no matches, as if the block doesn't exist.
2008 fRXPat->fCompiledPat->setSize(topOfBlock);
2009 return TRUE;
2010 }
2011
2012 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2013 // The thing being repeated is not a single op, but some
2014 // more complex block. Do it as a loop, not inlines.
2015 // Note that things "repeated" a max of once are handled as inline, because
2016 // the one copy of the code already generated is just fine.
2017 return FALSE;
2018 }
2019
2020 // Pick up the opcode that is to be repeated
2021 //
2022 int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock);
2023
2024 // Compute the pattern location where the inline sequence
2025 // will end, and set up the state save op that will be needed.
2026 //
2027 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2028 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2029 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
2030 if (fIntervalLow == 0) {
2031 insertOp(topOfBlock);
2032 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2033 }
2034
2035
2036
2037 // Loop, emitting the op for the thing being repeated each time.
2038 // Loop starts at 1 because one instance of the op already exists in the pattern,
2039 // it was put there when it was originally encountered.
2040 int32_t i;
2041 for (i=1; i<fIntervalUpper; i++ ) {
2042 if (i == fIntervalLow) {
2043 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2044 }
2045 if (i > fIntervalLow) {
2046 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2047 }
2048 fRXPat->fCompiledPat->addElement(op, *fStatus);
2049 }
2050 return TRUE;
2051 }
2052
2053
2054
2055 //------------------------------------------------------------------------------
2056 //
2057 // matchStartType Determine how a match can start.
2058 // Used to optimize find() operations.
2059 //
2060 // Operation is very similar to minMatchLength(). Walk the compiled
2061 // pattern, keeping an on-going minimum-match-length. For any
2062 // op where the min match coming in is zero, add that ops possible
2063 // starting matches to the possible starts for the overall pattern.
2064 //
2065 //------------------------------------------------------------------------------
2066 void RegexCompile::matchStartType() {
2067 if (U_FAILURE(*fStatus)) {
2068 return;
2069 }
2070
2071
2072 int32_t loc; // Location in the pattern of the current op being processed.
2073 int32_t op; // The op being processed
2074 int32_t opType; // The opcode type of the op
2075 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2076 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2077
2078 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2079 // could have advanced the position in a match.
2080 // (Maximum match length so far == 0)
2081
2082 // forwardedLength is a vector holding minimum-match-length values that
2083 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2084 // It must be one longer than the pattern being checked because some ops
2085 // will jmp to a end-of-block+1 location from within a block, and we must
2086 // count those when checking the block.
2087 int32_t end = fRXPat->fCompiledPat->size();
2088 UVector32 forwardedLength(end+1, *fStatus);
2089 forwardedLength.setSize(end+1);
2090 for (loc=3; loc<end; loc++) {
2091 forwardedLength.setElementAt(INT32_MAX, loc);
2092 }
2093
2094 for (loc = 3; loc<end; loc++) {
2095 op = fRXPat->fCompiledPat->elementAti(loc);
2096 opType = URX_TYPE(op);
2097
2098 // The loop is advancing linearly through the pattern.
2099 // If the op we are now at was the destination of a branch in the pattern,
2100 // and that path has a shorter minimum length than the current accumulated value,
2101 // replace the current accumulated value.
2102 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2103 if (forwardedLength.elementAti(loc) < currentLen) {
2104 currentLen = forwardedLength.elementAti(loc);
2105 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2106 }
2107
2108 switch (opType) {
2109 // Ops that don't change the total length matched
2110 case URX_RESERVED_OP:
2111 case URX_END:
2112 case URX_STRING_LEN:
2113 case URX_NOP:
2114 case URX_START_CAPTURE:
2115 case URX_END_CAPTURE:
2116 case URX_BACKSLASH_B:
2117 case URX_BACKSLASH_BU:
2118 case URX_BACKSLASH_G:
2119 case URX_BACKSLASH_Z:
2120 case URX_DOLLAR:
2121 case URX_RELOC_OPRND:
2122 case URX_STO_INP_LOC:
2123 case URX_DOLLAR_M:
2124 case URX_BACKTRACK:
2125 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2126 case URX_BACKREF_I:
2127
2128 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2129 case URX_LD_SP:
2130 break;
2131
2132 case URX_CARET:
2133 if (atStart) {
2134 fRXPat->fStartType = START_START;
2135 }
2136 break;
2137
2138 case URX_CARET_M:
2139 if (atStart) {
2140 fRXPat->fStartType = START_LINE;
2141 }
2142 break;
2143
2144 case URX_ONECHAR:
2145 if (currentLen == 0) {
2146 // This character could appear at the start of a match.
2147 // Add it to the set of possible starting characters.
2148 fRXPat->fInitialChars->add(URX_VAL(op));
2149 numInitialStrings += 2;
2150 }
2151 currentLen++;
2152 atStart = FALSE;
2153 break;
2154
2155
2156 case URX_SETREF:
2157 if (currentLen == 0) {
2158 int32_t sn = URX_VAL(op);
2159 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2160 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2161 fRXPat->fInitialChars->addAll(*s);
2162 numInitialStrings += 2;
2163 }
2164 currentLen++;
2165 atStart = FALSE;
2166 break;
2167
2168 case URX_LOOP_SR_I:
2169 // [Set]*, like a SETREF, above, in what it can match,
2170 // but may not match at all, so currentLen is not incremented.
2171 if (currentLen == 0) {
2172 int32_t sn = URX_VAL(op);
2173 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2174 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2175 fRXPat->fInitialChars->addAll(*s);
2176 numInitialStrings += 2;
2177 }
2178 atStart = FALSE;
2179 break;
2180
2181 case URX_LOOP_DOT_I:
2182 if (currentLen == 0) {
2183 // .* at the start of a pattern.
2184 // Any character can begin the match.
2185 fRXPat->fInitialChars->clear();
2186 fRXPat->fInitialChars->complement();
2187 numInitialStrings += 2;
2188 }
2189 atStart = FALSE;
2190 break;
2191
2192
2193 case URX_STATIC_SETREF:
2194 if (currentLen == 0) {
2195 int32_t sn = URX_VAL(op);
2196 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2197 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2198 fRXPat->fInitialChars->addAll(*s);
2199 numInitialStrings += 2;
2200 }
2201 currentLen++;
2202 atStart = FALSE;
2203 break;
2204
2205
2206
2207 case URX_STAT_SETREF_N:
2208 if (currentLen == 0) {
2209 int32_t sn = URX_VAL(op);
2210 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2211 UnicodeSet sc(*s);
2212 sc.complement();
2213 fRXPat->fInitialChars->addAll(sc);
2214 numInitialStrings += 2;
2215 }
2216 currentLen++;
2217 atStart = FALSE;
2218 break;
2219
2220
2221
2222 case URX_BACKSLASH_D:
2223 // Digit Char
2224 if (currentLen == 0) {
2225 UnicodeSet s;
2226 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2227 if (URX_VAL(op) != 0) {
2228 s.complement();
2229 }
2230 fRXPat->fInitialChars->addAll(s);
2231 numInitialStrings += 2;
2232 }
2233 currentLen++;
2234 atStart = FALSE;
2235 break;
2236
2237
2238 case URX_ONECHAR_I:
2239 // Case Insensitive Single Character.
2240 if (currentLen == 0) {
2241 UChar32 c = URX_VAL(op);
2242 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2243 // character may have distinct cased forms. Add all of them
2244 // to the set of possible starting match chars.
2245 UnicodeSet s(c, c);
2246 s.closeOver(USET_CASE_INSENSITIVE);
2247 fRXPat->fInitialChars->addAll(s);
2248 } else {
2249 // Char has no case variants. Just add it as-is to the
2250 // set of possible starting chars.
2251 fRXPat->fInitialChars->add(c);
2252 }
2253 numInitialStrings += 2;
2254 }
2255 currentLen++;
2256 atStart = FALSE;
2257 break;
2258
2259
2260 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2261 case URX_DOTANY_ALL: // . matches one or two.
2262 case URX_DOTANY:
2263 case URX_DOTANY_ALL_PL:
2264 case URX_DOTANY_PL:
2265 if (currentLen == 0) {
2266 // These constructs are all bad news when they appear at the start
2267 // of a match. Any character can begin the match.
2268 fRXPat->fInitialChars->clear();
2269 fRXPat->fInitialChars->complement();
2270 numInitialStrings += 2;
2271 }
2272 currentLen++;
2273 atStart = FALSE;
2274 break;
2275
2276
2277 case URX_JMPX:
2278 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2279 case URX_JMP:
2280 {
2281 int32_t jmpDest = URX_VAL(op);
2282 if (jmpDest < loc) {
2283 // Loop of some kind. Can safely ignore, the worst that will happen
2284 // is that we understate the true minimum length
2285 currentLen = forwardedLength.elementAti(loc+1);
2286
2287 } else {
2288 // Forward jump. Propagate the current min length to the target loc of the jump.
2289 U_ASSERT(jmpDest <= end+1);
2290 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2291 forwardedLength.setElementAt(currentLen, jmpDest);
2292 }
2293 }
2294 }
2295 atStart = FALSE;
2296 break;
2297
2298 case URX_JMP_SAV:
2299 case URX_JMP_SAV_X:
2300 // Combo of state save to the next loc, + jmp backwards.
2301 // Net effect on min. length computation is nothing.
2302 atStart = FALSE;
2303 break;
2304
2305 case URX_FAIL:
2306 // Fails are kind of like a branch, except that the min length was
2307 // propagated already, by the state save.
2308 currentLen = forwardedLength.elementAti(loc+1);
2309 atStart = FALSE;
2310 break;
2311
2312
2313 case URX_STATE_SAVE:
2314 {
2315 // State Save, for forward jumps, propagate the current minimum.
2316 // of the state save.
2317 int32_t jmpDest = URX_VAL(op);
2318 if (jmpDest > loc) {
2319 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2320 forwardedLength.setElementAt(currentLen, jmpDest);
2321 }
2322 }
2323 }
2324 atStart = FALSE;
2325 break;
2326
2327
2328
2329
2330 case URX_STRING:
2331 {
2332 loc++;
2333 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2334 int32_t stringLen = URX_VAL(stringLenOp);
2335 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2336 U_ASSERT(stringLenOp >= 2);
2337 if (currentLen == 0) {
2338 // Add the starting character of this string to the set of possible starting
2339 // characters for this pattern.
2340 int32_t stringStartIdx = URX_VAL(op);
2341 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2342 fRXPat->fInitialChars->add(c);
2343
2344 // Remember this string. After the entire pattern has been checked,
2345 // if nothing else is identified that can start a match, we'll use it.
2346 numInitialStrings++;
2347 fRXPat->fInitialStringIdx = stringStartIdx;
2348 fRXPat->fInitialStringLen = stringLen;
2349 }
2350
2351 currentLen += stringLen;
2352 atStart = FALSE;
2353 }
2354 break;
2355
2356 case URX_STRING_I:
2357 {
2358 // Case-insensitive string. Unlike exact-match strings, we won't
2359 // attempt a string search for possible match positions. But we
2360 // do update the set of possible starting characters.
2361 loc++;
2362 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2363 int32_t stringLen = URX_VAL(stringLenOp);
2364 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2365 U_ASSERT(stringLenOp >= 2);
2366 if (currentLen == 0) {
2367 // Add the starting character of this string to the set of possible starting
2368 // characters for this pattern.
2369 int32_t stringStartIdx = URX_VAL(op);
2370 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2371 UnicodeSet s(c, c);
2372 s.closeOver(USET_CASE_INSENSITIVE);
2373 fRXPat->fInitialChars->addAll(s);
2374 numInitialStrings += 2; // Matching on an initial string not possible.
2375 }
2376 currentLen += stringLen;
2377 atStart = FALSE;
2378 }
2379 break;
2380
2381 case URX_CTR_INIT:
2382 case URX_CTR_INIT_NG:
2383 {
2384 // Loop Init Ops. These don't change the min length, but they are 4 word ops
2385 // so location must be updated accordingly.
2386 // Loop Init Ops.
2387 // If the min loop count == 0
2388 // move loc forwards to the end of the loop, skipping over the body.
2389 // If the min count is > 0,
2390 // continue normal processing of the body of the loop.
2391 int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
2392 loopEndLoc = URX_VAL(loopEndLoc);
2393 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
2394 if (minLoopCount == 0) {
2395 // Min Loop Count of 0, treat like a forward branch and
2396 // move the current minimum length up to the target
2397 // (end of loop) location.
2398 U_ASSERT(loopEndLoc <= end+1);
2399 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
2400 forwardedLength.setElementAt(currentLen, loopEndLoc);
2401 }
2402 }
2403 loc+=3; // Skips over operands of CTR_INIT
2404 }
2405 atStart = FALSE;
2406 break;
2407
2408
2409 case URX_CTR_LOOP:
2410 case URX_CTR_LOOP_NG:
2411 // Loop ops.
2412 // The jump is conditional, backwards only.
2413 atStart = FALSE;
2414 break;
2415
2416 case URX_LOOP_C:
2417 // More loop ops. These state-save to themselves.
2418 // don't change the minimum match
2419 atStart = FALSE;
2420 break;
2421
2422
2423 case URX_LA_START:
2424 case URX_LB_START:
2425 {
2426 // Look-around. Scan forward until the matching look-ahead end,
2427 // without processing the look-around block. This is overly pessimistic.
2428 int32_t depth = 0;
2429 for (;;) {
2430 loc++;
2431 op = fRXPat->fCompiledPat->elementAti(loc);
2432 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
2433 depth++;
2434 }
2435 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2436 if (depth == 0) {
2437 break;
2438 }
2439 depth--;
2440 }
2441 if (URX_TYPE(op) == URX_STATE_SAVE) {
2442 // Need this because neg lookahead blocks will FAIL to outside
2443 // of the block.
2444 int32_t jmpDest = URX_VAL(op);
2445 if (jmpDest > loc) {
2446 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2447 forwardedLength.setElementAt(currentLen, jmpDest);
2448 }
2449 }
2450 }
2451 U_ASSERT(loc <= end);
2452 }
2453 }
2454 break;
2455
2456 case URX_LA_END:
2457 case URX_LB_CONT:
2458 case URX_LB_END:
2459 case URX_LBN_CONT:
2460 case URX_LBN_END:
2461 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
2462 // consumed by the scan in URX_LA_START and LB_START
2463
2464 break;
2465
2466 default:
2467 U_ASSERT(FALSE);
2468 }
2469
2470 }
2471
2472
2473 // We have finished walking through the ops. Check whether some forward jump
2474 // propagated a shorter length to location end+1.
2475 if (forwardedLength.elementAti(end+1) < currentLen) {
2476 currentLen = forwardedLength.elementAti(end+1);
2477 }
2478
2479
2480 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
2481
2482
2483 // Sort out what we should check for when looking for candidate match start positions.
2484 // In order of preference,
2485 // 1. Start of input text buffer.
2486 // 2. A literal string.
2487 // 3. Start of line in multi-line mode.
2488 // 4. A single literal character.
2489 // 5. A character from a set of characters.
2490 //
2491 if (fRXPat->fStartType == START_START) {
2492 // Match only at the start of an input text string.
2493 // start type is already set. We're done.
2494 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
2495 // Match beginning only with a literal string.
2496 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
2497 U_ASSERT(fRXPat->fInitialChars->contains(c));
2498 fRXPat->fStartType = START_STRING;
2499 fRXPat->fInitialChar = c;
2500 } else if (fRXPat->fStartType == START_LINE) {
2501 // Match at start of line in Multi-Line mode.
2502 // Nothing to do here; everything is already set.
2503 } else if (fRXPat->fMinMatchLen == 0) {
2504 // Zero length match possible. We could start anywhere.
2505 fRXPat->fStartType = START_NO_INFO;
2506 } else if (fRXPat->fInitialChars->size() == 1) {
2507 // All matches begin with the same char.
2508 fRXPat->fStartType = START_CHAR;
2509 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
2510 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
2511 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
2512 fRXPat->fMinMatchLen > 0) {
2513 // Matches start with a set of character smaller than the set of all chars.
2514 fRXPat->fStartType = START_SET;
2515 } else {
2516 // Matches can start with anything
2517 fRXPat->fStartType = START_NO_INFO;
2518 }
2519
2520 return;
2521 }
2522
2523
2524
2525 //------------------------------------------------------------------------------
2526 //
2527 // minMatchLength Calculate the length of the shortest string that could
2528 // match the specified pattern.
2529 // Length is in 16 bit code units, not code points.
2530 //
2531 // The calculated length may not be exact. The returned
2532 // value may be shorter than the actual minimum; it must
2533 // never be longer.
2534 //
2535 // start and end are the range of p-code operations to be
2536 // examined. The endpoints are included in the range.
2537 //
2538 //------------------------------------------------------------------------------
2539 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
2540 if (U_FAILURE(*fStatus)) {
2541 return 0;
2542 }
2543
2544 U_ASSERT(start <= end);
2545 U_ASSERT(end < fRXPat->fCompiledPat->size());
2546
2547
2548 int32_t loc;
2549 int32_t op;
2550 int32_t opType;
2551 int32_t currentLen = 0;
2552
2553
2554 // forwardedLength is a vector holding minimum-match-length values that
2555 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2556 // It must be one longer than the pattern being checked because some ops
2557 // will jmp to a end-of-block+1 location from within a block, and we must
2558 // count those when checking the block.
2559 UVector32 forwardedLength(end+2, *fStatus);
2560 forwardedLength.setSize(end+2);
2561 for (loc=start; loc<=end+1; loc++) {
2562 forwardedLength.setElementAt(INT32_MAX, loc);
2563 }
2564
2565 for (loc = start; loc<=end; loc++) {
2566 op = fRXPat->fCompiledPat->elementAti(loc);
2567 opType = URX_TYPE(op);
2568
2569 // The loop is advancing linearly through the pattern.
2570 // If the op we are now at was the destination of a branch in the pattern,
2571 // and that path has a shorter minimum length than the current accumulated value,
2572 // replace the current accumulated value.
2573 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2574 if (forwardedLength.elementAti(loc) < currentLen) {
2575 currentLen = forwardedLength.elementAti(loc);
2576 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2577 }
2578
2579 switch (opType) {
2580 // Ops that don't change the total length matched
2581 case URX_RESERVED_OP:
2582 case URX_END:
2583 case URX_STRING_LEN:
2584 case URX_NOP:
2585 case URX_START_CAPTURE:
2586 case URX_END_CAPTURE:
2587 case URX_BACKSLASH_B:
2588 case URX_BACKSLASH_BU:
2589 case URX_BACKSLASH_G:
2590 case URX_BACKSLASH_Z:
2591 case URX_CARET:
2592 case URX_DOLLAR:
2593 case URX_RELOC_OPRND:
2594 case URX_STO_INP_LOC:
2595 case URX_DOLLAR_M:
2596 case URX_CARET_M:
2597 case URX_BACKTRACK:
2598 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2599 case URX_BACKREF_I:
2600
2601 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2602 case URX_LD_SP:
2603
2604 case URX_JMP_SAV:
2605 case URX_JMP_SAV_X:
2606 break;
2607
2608
2609 // Ops that match a minimum of one character (one or two 16 bit code units.)
2610 //
2611 case URX_ONECHAR:
2612 case URX_STATIC_SETREF:
2613 case URX_STAT_SETREF_N:
2614 case URX_SETREF:
2615 case URX_BACKSLASH_D:
2616 case URX_ONECHAR_I:
2617 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2618 case URX_DOTANY_ALL: // . matches one or two.
2619 case URX_DOTANY:
2620 case URX_DOTANY_PL:
2621 case URX_DOTANY_ALL_PL:
2622 currentLen++;
2623 break;
2624
2625
2626 case URX_JMPX:
2627 loc++; // URX_JMPX has an extra operand, ignored here,
2628 // otherwise processed identically to URX_JMP.
2629 case URX_JMP:
2630 {
2631 int32_t jmpDest = URX_VAL(op);
2632 if (jmpDest < loc) {
2633 // Loop of some kind. Can safely ignore, the worst that will happen
2634 // is that we understate the true minimum length
2635 currentLen = forwardedLength.elementAti(loc+1);
2636 } else {
2637 // Forward jump. Propagate the current min length to the target loc of the jump.
2638 U_ASSERT(jmpDest <= end+1);
2639 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2640 forwardedLength.setElementAt(currentLen, jmpDest);
2641 }
2642 }
2643 }
2644 break;
2645
2646 case URX_FAIL:
2647 {
2648 // Fails are kind of like a branch, except that the min length was
2649 // propagated already, by the state save.
2650 currentLen = forwardedLength.elementAti(loc+1);
2651 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2652 }
2653 break;
2654
2655
2656 case URX_STATE_SAVE:
2657 {
2658 // State Save, for forward jumps, propagate the current minimum.
2659 // of the state save.
2660 int32_t jmpDest = URX_VAL(op);
2661 if (jmpDest > loc) {
2662 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2663 forwardedLength.setElementAt(currentLen, jmpDest);
2664 }
2665 }
2666 }
2667 break;
2668
2669
2670 case URX_STRING:
2671 case URX_STRING_I:
2672 {
2673 loc++;
2674 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2675 currentLen += URX_VAL(stringLenOp);
2676 }
2677 break;
2678
2679
2680 case URX_CTR_INIT:
2681 case URX_CTR_INIT_NG:
2682 {
2683 // Loop Init Ops.
2684 // If the min loop count == 0
2685 // move loc forwards to the end of the loop, skipping over the body.
2686 // If the min count is > 0,
2687 // continue normal processing of the body of the loop.
2688 int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
2689 loopEndLoc = URX_VAL(loopEndLoc);
2690 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
2691 if (minLoopCount == 0) {
2692 loc = loopEndLoc;
2693 } else {
2694 loc+=3; // Skips over operands of CTR_INIT
2695 }
2696 }
2697 break;
2698
2699
2700 case URX_CTR_LOOP:
2701 case URX_CTR_LOOP_NG:
2702 // Loop ops.
2703 // The jump is conditional, backwards only.
2704 break;
2705
2706 case URX_LOOP_SR_I:
2707 case URX_LOOP_DOT_I:
2708 case URX_LOOP_C:
2709 // More loop ops. These state-save to themselves.
2710 // don't change the minimum match - could match nothing at all.
2711 break;
2712
2713
2714 case URX_LA_START:
2715 case URX_LB_START:
2716 {
2717 // Look-around. Scan forward until the matching look-ahead end,
2718 // without processing the look-around block. This is overly pessimistic.
2719 // TODO: Positive lookahead could recursively do the block, then continue
2720 // with the longer of the block or the value coming in.
2721 int32_t depth = 0;
2722 for (;;) {
2723 loc++;
2724 op = fRXPat->fCompiledPat->elementAti(loc);
2725 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
2726 depth++;
2727 }
2728 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2729 if (depth == 0) {
2730 break;
2731 }
2732 depth--;
2733 }
2734 if (URX_TYPE(op) == URX_STATE_SAVE) {
2735 // Need this because neg lookahead blocks will FAIL to outside
2736 // of the block.
2737 int32_t jmpDest = URX_VAL(op);
2738 if (jmpDest > loc) {
2739 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2740 forwardedLength.setElementAt(currentLen, jmpDest);
2741 }
2742 }
2743 }
2744
2745 U_ASSERT(loc <= end);
2746 }
2747 }
2748 break;
2749
2750 case URX_LA_END:
2751 case URX_LB_CONT:
2752 case URX_LB_END:
2753 case URX_LBN_CONT:
2754 case URX_LBN_END:
2755 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
2756 // range being sized, which happens when measuring size of look-behind blocks.
2757 break;
2758
2759 default:
2760 U_ASSERT(FALSE);
2761 }
2762
2763 }
2764
2765 // We have finished walking through the ops. Check whether some forward jump
2766 // propagated a shorter length to location end+1.
2767 if (forwardedLength.elementAti(end+1) < currentLen) {
2768 currentLen = forwardedLength.elementAti(end+1);
2769 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2770 }
2771
2772 return currentLen;
2773 }
2774
2775
2776
2777 //------------------------------------------------------------------------------
2778 //
2779 // maxMatchLength Calculate the length of the longest string that could
2780 // match the specified pattern.
2781 // Length is in 16 bit code units, not code points.
2782 //
2783 // The calculated length may not be exact. The returned
2784 // value may be longer than the actual maximum; it must
2785 // never be shorter.
2786 //
2787 //------------------------------------------------------------------------------
2788 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
2789 if (U_FAILURE(*fStatus)) {
2790 return 0;
2791 }
2792 U_ASSERT(start <= end);
2793 U_ASSERT(end < fRXPat->fCompiledPat->size());
2794
2795
2796 int32_t loc;
2797 int32_t op;
2798 int32_t opType;
2799 int32_t currentLen = 0;
2800 UVector32 forwardedLength(end+1, *fStatus);
2801 forwardedLength.setSize(end+1);
2802
2803 for (loc=start; loc<=end; loc++) {
2804 forwardedLength.setElementAt(0, loc);
2805 }
2806
2807 for (loc = start; loc<=end; loc++) {
2808 op = fRXPat->fCompiledPat->elementAti(loc);
2809 opType = URX_TYPE(op);
2810
2811 // The loop is advancing linearly through the pattern.
2812 // If the op we are now at was the destination of a branch in the pattern,
2813 // and that path has a longer maximum length than the current accumulated value,
2814 // replace the current accumulated value.
2815 if (forwardedLength.elementAti(loc) > currentLen) {
2816 currentLen = forwardedLength.elementAti(loc);
2817 }
2818
2819 switch (opType) {
2820 // Ops that don't change the total length matched
2821 case URX_RESERVED_OP:
2822 case URX_END:
2823 case URX_STRING_LEN:
2824 case URX_NOP:
2825 case URX_START_CAPTURE:
2826 case URX_END_CAPTURE:
2827 case URX_BACKSLASH_B:
2828 case URX_BACKSLASH_BU:
2829 case URX_BACKSLASH_G:
2830 case URX_BACKSLASH_Z:
2831 case URX_CARET:
2832 case URX_DOLLAR:
2833 case URX_RELOC_OPRND:
2834 case URX_STO_INP_LOC:
2835 case URX_DOLLAR_M:
2836 case URX_CARET_M:
2837 case URX_BACKTRACK:
2838
2839 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2840 case URX_LD_SP:
2841
2842 case URX_LB_END:
2843 case URX_LB_CONT:
2844 case URX_LBN_CONT:
2845 case URX_LBN_END:
2846 break;
2847
2848
2849 // Ops that increase that cause an unbounded increase in the length
2850 // of a matched string, or that increase it a hard to characterize way.
2851 // Call the max length unbounded, and stop further checking.
2852 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2853 case URX_BACKREF_I:
2854 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2855 case URX_DOTANY_PL:
2856 case URX_DOTANY_ALL_PL:
2857 currentLen = INT32_MAX;
2858 break;
2859
2860
2861 // Ops that match a max of one character (possibly two 16 bit code units.)
2862 //
2863 case URX_STATIC_SETREF:
2864 case URX_STAT_SETREF_N:
2865 case URX_SETREF:
2866 case URX_BACKSLASH_D:
2867 case URX_ONECHAR_I:
2868 case URX_DOTANY_ALL:
2869 case URX_DOTANY:
2870 currentLen+=2;
2871 break;
2872
2873 // Single literal character. Increase current max length by one or two,
2874 // depending on whether the char is in the supplementary range.
2875 case URX_ONECHAR:
2876 currentLen++;
2877 if (URX_VAL(op) > 0x10000) {
2878 currentLen++;
2879 }
2880 break;
2881
2882 // Jumps.
2883 //
2884 case URX_JMP:
2885 case URX_JMPX:
2886 case URX_JMP_SAV:
2887 case URX_JMP_SAV_X:
2888 {
2889 int32_t jmpDest = URX_VAL(op);
2890 if (jmpDest < loc) {
2891 // Loop of some kind. Max match length is unbounded.
2892 currentLen = INT32_MAX;
2893 } else {
2894 // Forward jump. Propagate the current min length to the target loc of the jump.
2895 if (forwardedLength.elementAti(jmpDest) < currentLen) {
2896 forwardedLength.setElementAt(currentLen, jmpDest);
2897 }
2898 currentLen = 0;
2899 }
2900 }
2901 break;
2902
2903 case URX_FAIL:
2904 // Fails are kind of like a branch, except that the max length was
2905 // propagated already, by the state save.
2906 currentLen = forwardedLength.elementAti(loc+1);
2907 break;
2908
2909
2910 case URX_STATE_SAVE:
2911 {
2912 // State Save, for forward jumps, propagate the current minimum.
2913 // of the state save.
2914 // For backwards jumps, they create a loop, maximum
2915 // match length is unbounded.
2916 int32_t jmpDest = URX_VAL(op);
2917 if (jmpDest > loc) {
2918 if (currentLen > forwardedLength.elementAti(jmpDest)) {
2919 forwardedLength.setElementAt(currentLen, jmpDest);
2920 }
2921 } else {
2922 currentLen = INT32_MAX;
2923 }
2924 }
2925 break;
2926
2927
2928
2929
2930 case URX_STRING:
2931 case URX_STRING_I:
2932 {
2933 loc++;
2934 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
2935 currentLen += URX_VAL(stringLenOp);
2936 }
2937 break;
2938
2939
2940 case URX_CTR_INIT:
2941 case URX_CTR_INIT_NG:
2942 case URX_CTR_LOOP:
2943 case URX_CTR_LOOP_NG:
2944 case URX_LOOP_SR_I:
2945 case URX_LOOP_DOT_I:
2946 case URX_LOOP_C:
2947 // For anything to do with loops, make the match length unbounded.
2948 // Note: INIT instructions are multi-word. Can ignore because
2949 // INT32_MAX length will stop the per-instruction loop.
2950 currentLen = INT32_MAX;
2951 break;
2952
2953
2954
2955 case URX_LA_START:
2956 case URX_LA_END:
2957 // Look-ahead. Just ignore, treat the look-ahead block as if
2958 // it were normal pattern. Gives a too-long match length,
2959 // but good enough for now.
2960 break;
2961
2962 // End of look-ahead ops should always be consumed by the processing at
2963 // the URX_LA_START op.
2964 // U_ASSERT(FALSE);
2965 // break;
2966
2967 case URX_LB_START:
2968 {
2969 // Look-behind. Scan forward until the matching look-around end,
2970 // without processing the look-behind block.
2971 int32_t depth = 0;
2972 for (;;) {
2973 loc++;
2974 op = fRXPat->fCompiledPat->elementAti(loc);
2975 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
2976 depth++;
2977 }
2978 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2979 if (depth == 0) {
2980 break;
2981 }
2982 depth--;
2983 }
2984 U_ASSERT(loc < end);
2985 }
2986 }
2987 break;
2988
2989 default:
2990 U_ASSERT(FALSE);
2991 }
2992
2993
2994 if (currentLen == INT32_MAX) {
2995 // The maximum length is unbounded.
2996 // Stop further processing of the pattern.
2997 break;
2998 }
2999
3000 }
3001 return currentLen;
3002
3003 }
3004
3005
3006 //------------------------------------------------------------------------------
3007 //
3008 // stripNOPs Remove any NOP operations from the compiled pattern code.
3009 // Extra NOPs are inserted for some constructs during the initial
3010 // code generation to provide locations that may be patched later.
3011 // Many end up unneeded, and are removed by this function.
3012 //
3013 //------------------------------------------------------------------------------
3014 void RegexCompile::stripNOPs() {
3015
3016 if (U_FAILURE(*fStatus)) {
3017 return;
3018 }
3019
3020 int32_t end = fRXPat->fCompiledPat->size();
3021 UVector32 deltas(end, *fStatus);
3022
3023 // Make a first pass over the code, computing the amount that things
3024 // will be offset at each location in the original code.
3025 int32_t loc;
3026 int32_t d = 0;
3027 for (loc=0; loc<end; loc++) {
3028 deltas.addElement(d, *fStatus);
3029 int32_t op = fRXPat->fCompiledPat->elementAti(loc);
3030 if (URX_TYPE(op) == URX_NOP) {
3031 d++;
3032 }
3033 }
3034
3035 // Make a second pass over the code, removing the NOPs by moving following
3036 // code up, and patching operands that refer to code locations that
3037 // are being moved. The array of offsets from the first step is used
3038 // to compute the new operand values.
3039 int32_t src;
3040 int32_t dst = 0;
3041 for (src=0; src<end; src++) {
3042 int32_t op = fRXPat->fCompiledPat->elementAti(src);
3043 int32_t opType = URX_TYPE(op);
3044 switch (opType) {
3045 case URX_NOP:
3046 break;
3047
3048 case URX_STATE_SAVE:
3049 case URX_JMP:
3050 case URX_CTR_LOOP:
3051 case URX_CTR_LOOP_NG:
3052 case URX_RELOC_OPRND:
3053 case URX_JMPX:
3054 case URX_JMP_SAV:
3055 case URX_JMP_SAV_X:
3056 // These are instructions with operands that refer to code locations.
3057 {
3058 int32_t operandAddress = URX_VAL(op);
3059 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3060 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3061 op = URX_BUILD(opType, fixedOperandAddress);
3062 fRXPat->fCompiledPat->setElementAt(op, dst);
3063 dst++;
3064 break;
3065 }
3066
3067 case URX_RESERVED_OP:
3068 case URX_RESERVED_OP_N:
3069 case URX_BACKTRACK:
3070 case URX_END:
3071 case URX_ONECHAR:
3072 case URX_STRING:
3073 case URX_STRING_LEN:
3074 case URX_START_CAPTURE:
3075 case URX_END_CAPTURE:
3076 case URX_STATIC_SETREF:
3077 case URX_STAT_SETREF_N:
3078 case URX_SETREF:
3079 case URX_DOTANY:
3080 case URX_FAIL:
3081 case URX_BACKSLASH_B:
3082 case URX_BACKSLASH_BU:
3083 case URX_BACKSLASH_G:
3084 case URX_BACKSLASH_X:
3085 case URX_BACKSLASH_Z:
3086 case URX_DOTANY_ALL:
3087 case URX_DOTANY_ALL_PL:
3088 case URX_DOTANY_PL:
3089 case URX_BACKSLASH_D:
3090 case URX_CARET:
3091 case URX_DOLLAR:
3092 case URX_CTR_INIT:
3093 case URX_CTR_INIT_NG:
3094 case URX_STO_SP:
3095 case URX_LD_SP:
3096 case URX_BACKREF:
3097 case URX_STO_INP_LOC:
3098 case URX_LA_START:
3099 case URX_LA_END:
3100 case URX_ONECHAR_I:
3101 case URX_STRING_I:
3102 case URX_BACKREF_I:
3103 case URX_DOLLAR_M:
3104 case URX_CARET_M:
3105 case URX_LB_START:
3106 case URX_LB_CONT:
3107 case URX_LB_END:
3108 case URX_LBN_CONT:
3109 case URX_LBN_END:
3110 case URX_LOOP_SR_I:
3111 case URX_LOOP_DOT_I:
3112 case URX_LOOP_C:
3113 // These instructions are unaltered by the relocation.
3114 fRXPat->fCompiledPat->setElementAt(op, dst);
3115 dst++;
3116 break;
3117
3118 default:
3119 // Some op is unaccounted for.
3120 U_ASSERT(FALSE);
3121 error(U_REGEX_INTERNAL_ERROR);
3122 }
3123 }
3124
3125 fRXPat->fCompiledPat->setSize(dst);
3126 }
3127
3128
3129
3130
3131 //------------------------------------------------------------------------------
3132 //
3133 // OptDotStar Optimize patterns that end with a '.*' or '.+' to
3134 // just advance the input to the end.
3135 //
3136 // Transform this compiled sequence
3137 // [DOT_ANY | DOT_ANY_ALL]
3138 // JMP_SAV to previous instruction
3139 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3140 // END
3141 //
3142 // To
3143 // NOP
3144 // [DOT_ANY_PL | DOT_ANY_ALL_PL]
3145 // [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
3146 // END
3147 //
3148 //------------------------------------------------------------------------------
3149 void RegexCompile::OptDotStar() {
3150 // Scan backwards in the pattern, looking for a JMP_SAV near the end.
3151 int32_t jmpLoc;
3152 int32_t op = 0;
3153 int32_t opType;
3154 for (jmpLoc=fRXPat->fCompiledPat->size(); jmpLoc--;) {
3155 U_ASSERT(jmpLoc>0);
3156 op = fRXPat->fCompiledPat->elementAti(jmpLoc);
3157 opType = URX_TYPE(op);
3158 switch(opType) {
3159
3160
3161 case URX_END:
3162 case URX_NOP:
3163 case URX_END_CAPTURE:
3164 case URX_DOLLAR_M:
3165 case URX_DOLLAR:
3166 case URX_BACKSLASH_Z:
3167 // These ops may follow the JMP_SAV without preventing us from
3168 // doing this optimization.
3169 continue;
3170
3171 case URX_JMP_SAV:
3172 // Got a trailing JMP_SAV that's a candidate for optimization.
3173 break;
3174
3175 default:
3176 // This optimization not possible.
3177 return;
3178 }
3179 break; // from the for loop.
3180 }
3181
3182 // We found in URX_JMP_SAV near the end that is a candidate for optimizing.
3183 // Is the target address the previous instruction?
3184 // Is the previous instruction a flavor of URX_DOTANY
3185 int32_t loopTopLoc = URX_VAL(op);
3186 if (loopTopLoc != jmpLoc-1) {
3187 return;
3188 }
3189 int32_t newOp;
3190 int32_t oldOp = fRXPat->fCompiledPat->elementAti(loopTopLoc);
3191 int32_t oldOpType = opType = URX_TYPE(oldOp);
3192 if (oldOpType == URX_DOTANY) {
3193 newOp = URX_BUILD(URX_DOTANY_PL, 0);
3194 }
3195 else if (oldOpType == URX_DOTANY_ALL) {
3196 newOp = URX_BUILD(URX_DOTANY_ALL_PL, 0);
3197 } else {
3198 return; // Sequence we were looking for isn't there.
3199 }
3200
3201 // Substitute the new instructions into the pattern.
3202 // The NOP will be removed in a later optimization step.
3203 fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loopTopLoc);
3204 fRXPat->fCompiledPat->setElementAt(newOp, jmpLoc);
3205 }
3206
3207
3208 //------------------------------------------------------------------------------
3209 //
3210 // Error Report a rule parse error.
3211 // Only report it if no previous error has been recorded.
3212 //
3213 //------------------------------------------------------------------------------
3214 void RegexCompile::error(UErrorCode e) {
3215 if (U_SUCCESS(*fStatus)) {
3216 *fStatus = e;
3217 fParseErr->line = fLineNum;
3218 fParseErr->offset = fCharNum;
3219
3220 // Fill in the context.
3221 // Note: extractBetween() pins supplied indicies to the string bounds.
3222 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3223 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3224 fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
3225 fParseErr->preContext, 0);
3226 fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
3227 fParseErr->postContext, 0);
3228 }
3229 }
3230
3231
3232 //
3233 // Assorted Unicode character constants.
3234 // Numeric because there is no portable way to enter them as literals.
3235 // (Think EBCDIC).
3236 //
3237 static const UChar chCR = 0x0d; // New lines, for terminating comments.
3238 static const UChar chLF = 0x0a;
3239 static const UChar chNEL = 0x85; // NEL newline variant
3240 static const UChar chLS = 0x2028; // Unicode Line Separator
3241 static const UChar chPound = 0x23; // '#', introduces a comment.
3242 static const UChar chE = 0x45; // 'E'
3243 static const UChar chUpperN = 0x4E;
3244 static const UChar chLowerP = 0x70;
3245 static const UChar chUpperP = 0x50;
3246 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3247 static const UChar chLBracket = 0x5b;
3248 static const UChar chRBracket = 0x5d;
3249 static const UChar chRBrace = 0x7d;
3250
3251
3252 //------------------------------------------------------------------------------
3253 //
3254 // nextCharLL Low Level Next Char from the regex pattern.
3255 // Get a char from the string, keep track of input position
3256 // for error reporting.
3257 //
3258 //------------------------------------------------------------------------------
3259 UChar32 RegexCompile::nextCharLL() {
3260 UChar32 ch;
3261 UnicodeString &pattern = fRXPat->fPattern;
3262
3263 if (fPeekChar != -1) {
3264 ch = fPeekChar;
3265 fPeekChar = -1;
3266 return ch;
3267 }
3268 if (fPatternLength==0 || fNextIndex >= fPatternLength) {
3269 return (UChar32)-1;
3270 }
3271 ch = pattern.char32At(fNextIndex);
3272 fNextIndex = pattern.moveIndex32(fNextIndex, 1);
3273
3274 if (ch == chCR ||
3275 ch == chNEL ||
3276 ch == chLS ||
3277 ch == chLF && fLastChar != chCR) {
3278 // Character is starting a new line. Bump up the line number, and
3279 // reset the column to 0.
3280 fLineNum++;
3281 fCharNum=0;
3282 if (fQuoteMode) {
3283 error(U_REGEX_RULE_SYNTAX);
3284 fQuoteMode = FALSE;
3285 }
3286 }
3287 else {
3288 // Character is not starting a new line. Except in the case of a
3289 // LF following a CR, increment the column position.
3290 if (ch != chLF) {
3291 fCharNum++;
3292 }
3293 }
3294 fLastChar = ch;
3295 return ch;
3296 }
3297
3298 //------------------------------------------------------------------------------
3299 //
3300 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3301 // character without actually getting it.
3302 //
3303 //------------------------------------------------------------------------------
3304 UChar32 RegexCompile::peekCharLL() {
3305 if (fPeekChar == -1) {
3306 fPeekChar = nextCharLL();
3307 }
3308 return fPeekChar;
3309 }
3310
3311
3312 //------------------------------------------------------------------------------
3313 //
3314 // nextChar for pattern scanning. At this level, we handle stripping
3315 // out comments and processing some backslash character escapes.
3316 // The rest of the pattern grammar is handled at the next level up.
3317 //
3318 //------------------------------------------------------------------------------
3319 void RegexCompile::nextChar(RegexPatternChar &c) {
3320
3321 fScanIndex = fNextIndex;
3322 c.fChar = nextCharLL();
3323 c.fQuoted = FALSE;
3324
3325 if (fQuoteMode) {
3326 c.fQuoted = TRUE;
3327 if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
3328 fQuoteMode = FALSE; // Exit quote mode,
3329 nextCharLL(); // discard the E
3330 nextChar(c); // recurse to get the real next char
3331 }
3332 }
3333 else if (fInBackslashQuote) {
3334 // The current character immediately follows a '\'
3335 // Don't check for any further escapes, just return it as-is.
3336 // Don't set c.fQuoted, because that would prevent the state machine from
3337 // dispatching on the character.
3338 fInBackslashQuote = FALSE;
3339 }
3340 else
3341 {
3342 // We are not in a \Q quoted region \E of the source.
3343 //
3344 if (fModeFlags & UREGEX_COMMENTS) {
3345 //
3346 // We are in free-spacing and comments mode.
3347 // Scan through any white space and comments, until we
3348 // reach a significant character or the end of inut.
3349 for (;;) {
3350 if (c.fChar == (UChar32)-1) {
3351 break; // End of Input
3352 }
3353 if (c.fChar == chPound && fEOLComments == TRUE) {
3354 // Start of a comment. Consume the rest of it, until EOF or a new line
3355 for (;;) {
3356 c.fChar = nextCharLL();
3357 if (c.fChar == (UChar32)-1 || // EOF
3358 c.fChar == chCR ||
3359 c.fChar == chLF ||
3360 c.fChar == chNEL ||
3361 c.fChar == chLS) {
3362 break;
3363 }
3364 }
3365 }
3366 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
3367 break;
3368 }
3369 c.fChar = nextCharLL();
3370 }
3371 }
3372
3373 //
3374 // check for backslash escaped characters.
3375 //
3376 int32_t startX = fNextIndex; // start and end positions of the
3377 int32_t endX = fNextIndex; // sequence following the '\'
3378 if (c.fChar == chBackSlash) {
3379 if (RegexStaticSets::gStaticSets->fUnescapeCharSet->contains(peekCharLL())) {
3380 //
3381 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3382 // Includes \uxxxx, \n, \r, many others.
3383 // Return the single equivalent character.
3384 //
3385 nextCharLL(); // get & discard the peeked char.
3386 c.fQuoted = TRUE;
3387 c.fChar = fRXPat->fPattern.unescapeAt(endX);
3388 if (startX == endX) {
3389 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3390 }
3391 fCharNum += endX - startX;
3392 fNextIndex = endX;
3393 }
3394 else
3395 {
3396 // We are in a '\' escape that will be handled by the state table scanner.
3397 // Just return the backslash, but remember that the following char is to
3398 // be taken literally. TODO: this is awkward, think about alternatives.
3399 fInBackslashQuote = TRUE;
3400 }
3401 }
3402 }
3403
3404 // re-enable # to end-of-line comments, in case they were disabled.
3405 // They are disabled by the parser upon seeing '(?', but this lasts for
3406 // the fetching of the next character only.
3407 fEOLComments = TRUE;
3408
3409 // putc(c.fChar, stdout);
3410 }
3411
3412
3413
3414 //------------------------------------------------------------------------------
3415 //
3416 // scanSet Construct a UnicodeSet from the text at the current scan
3417 // position. Advance the scan position to the first character
3418 // after the set.
3419 //
3420 // The scan position is normally under the control of the state machine
3421 // that controls pattern parsing. UnicodeSets, however, are parsed by
3422 // the UnicodeSet constructor, not by the Regex pattern parser.
3423 //
3424 //------------------------------------------------------------------------------
3425 UnicodeSet *RegexCompile::scanSet() {
3426 UnicodeSet *uset = NULL;
3427 ParsePosition pos;
3428 int i;
3429
3430 if (U_FAILURE(*fStatus)) {
3431 return NULL;
3432 }
3433
3434 pos.setIndex(fScanIndex);
3435 UErrorCode localStatus = U_ZERO_ERROR;
3436 uint32_t usetFlags = 0;
3437 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
3438 usetFlags |= USET_CASE_INSENSITIVE;
3439 }
3440 if (fModeFlags & UREGEX_COMMENTS) {
3441 usetFlags |= USET_IGNORE_SPACE;
3442 }
3443
3444 uset = new UnicodeSet(fRXPat->fPattern, pos,
3445 usetFlags, NULL, localStatus);
3446 if (U_FAILURE(localStatus)) {
3447 // TODO: Get more accurate position of the error from UnicodeSet's return info.
3448 // UnicodeSet appears to not be reporting correctly at this time.
3449 REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()));
3450 error(localStatus);
3451 delete uset;
3452 return NULL;
3453 }
3454
3455 // Advance the current scan postion over the UnicodeSet.
3456 // Don't just set fScanIndex because the line/char positions maintained
3457 // for error reporting would be thrown off.
3458 i = pos.getIndex();
3459 for (;;) {
3460 if (fNextIndex >= i) {
3461 break;
3462 }
3463 nextCharLL();
3464 }
3465
3466 return uset;
3467 }
3468
3469
3470 //------------------------------------------------------------------------------
3471 //
3472 // scanProp Construct a UnicodeSet from the text at the current scan
3473 // position, which will be of the form \p{whaterver}
3474 //
3475 // The scan position will be at the 'p' or 'P'. On return
3476 // the scan position should be just after the '}'
3477 //
3478 // Return a UnicodeSet, constructed from the \P pattern,
3479 // or NULL if the pattern is invalid.
3480 //
3481 //------------------------------------------------------------------------------
3482 UnicodeSet *RegexCompile::scanProp() {
3483 UnicodeSet *uset = NULL;
3484
3485 if (U_FAILURE(*fStatus)) {
3486 return NULL;
3487 }
3488
3489 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP || fC.fChar == chUpperN);
3490
3491 // enclose the \p{property} from the regex pattern source in [brackets]
3492 UnicodeString setPattern;
3493 setPattern.append(chLBracket);
3494 setPattern.append(chBackSlash);
3495 for (;;) {
3496 setPattern.append(fC.fChar);
3497 if (fC.fChar == chRBrace) {
3498 break;
3499 }
3500 nextChar(fC);
3501 if (fC.fChar == -1) {
3502 // Hit the end of the input string without finding the closing '}'
3503 error(U_REGEX_PROPERTY_SYNTAX);
3504 return NULL;
3505 }
3506 }
3507 setPattern.append(chRBracket);
3508
3509 uint32_t usetFlags = 0;
3510 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
3511 usetFlags |= USET_CASE_INSENSITIVE;
3512 }
3513 if (fModeFlags & UREGEX_COMMENTS) {
3514 usetFlags |= USET_IGNORE_SPACE;
3515 }
3516
3517 // Build the UnicodeSet from the set pattern we just built up in a string.
3518 uset = new UnicodeSet(setPattern, usetFlags, NULL, *fStatus);
3519 if (U_FAILURE(*fStatus)) {
3520 delete uset;
3521 uset = NULL;
3522 }
3523
3524 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
3525 return uset;
3526 }
3527
3528 U_NAMESPACE_END
3529 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
3530